summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrancis Quiers <fquiers@cisco.com>2020-05-18 17:15:05 +0100
committerTimothy B. Terriberry <territim@amazon.com>2022-03-07 16:24:42 -0800
commitc6f98577716d39907264d5388b74b5be5dea3d6c (patch)
treec28d7ed32be8f98d43d5cb34e8efb004399ffe0e
parent37aba6e9b382f7dbdb7916adbc335704cf2992e8 (diff)
downloadopus-c6f98577716d39907264d5388b74b5be5dea3d6c.tar.gz
Update and re-enable SILK SSE4.1 optimisations
-rw-r--r--silk/NSQ.c34
-rw-r--r--silk/NSQ_del_dec.c28
-rw-r--r--silk/SigProc_FIX.h6
-rw-r--r--silk/VQ_WMat_EC.c4
-rw-r--r--silk/fixed/burg_modified_FIX.c8
-rw-r--r--silk/fixed/vector_ops_FIX.c2
-rw-r--r--silk/fixed/x86/burg_modified_FIX_sse4_1.c69
-rw-r--r--silk/fixed/x86/prefilter_FIX_sse.c160
-rw-r--r--silk/fixed/x86/vector_ops_FIX_sse4_1.c40
-rw-r--r--silk/main.h60
-rw-r--r--silk/x86/NSQ_del_dec_sse4_1.c179
-rw-r--r--silk/x86/NSQ_sse4_1.c211
-rw-r--r--silk/x86/SigProc_FIX_sse.h12
-rw-r--r--silk/x86/VAD_sse4_1.c28
-rw-r--r--silk/x86/VQ_WMat_EC_sse4_1.c189
-rw-r--r--silk/x86/main_sse.h170
-rw-r--r--silk/x86/x86_silk_map.c89
17 files changed, 650 insertions, 639 deletions
diff --git a/silk/NSQ.c b/silk/NSQ.c
index ab8f9b6c..45dd45ce 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -75,21 +75,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer(
void silk_NSQ_c
(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int k, lag, start_idx, LSF_interpolation_flag;
@@ -173,9 +173,9 @@ void silk_NSQ_c
RESTORE_STACK;
}
-/***********************************/
-/* silk_noise_shape_quantizer */
-/***********************************/
+/******************************/
+/* silk_noise_shape_quantizer */
+/******************************/
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index 00e749c3..41f3fc93 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -115,21 +115,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
);
void silk_NSQ_del_dec_c(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index f9ae3263..1d9bf2f1 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale(
const opus_int len /* I vector lengths */
);
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
@@ -613,8 +613,8 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
- ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
+#define silk_inner_prod16(inVec1, inVec2, len, arch) \
+ ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
#endif
#include "Inlines.h"
diff --git a/silk/VQ_WMat_EC.c b/silk/VQ_WMat_EC.c
index 0f3d545c..245a7e4b 100644
--- a/silk/VQ_WMat_EC.c
+++ b/silk/VQ_WMat_EC.c
@@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c(
*rate_dist_Q8 = silk_int32_MAX;
*res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
- /* In things go really bad, at least *ind is set to something safe. */
+ /* If things go really bad, at least *ind is set to something safe. */
*ind = 0;
for( k = 0; k < L; k++ ) {
opus_int32 penalty;
@@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c(
if( sum1_Q15 >= 0 ) {
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
- /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
+ /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
*rate_dist_Q8 = bits_tot_Q8;
diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c
index 274d4b28..185a12b1 100644
--- a/silk/fixed/burg_modified_FIX.c
+++ b/silk/fixed/burg_modified_FIX.c
@@ -68,7 +68,7 @@ void silk_burg_modified_c(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
- C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
+ C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
@@ -87,7 +87,7 @@ void silk_burg_modified_c(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
- silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+ silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@@ -150,7 +150,7 @@ void silk_burg_modified_c(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
- /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
+ /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
but they cancel each other and the real result seems to always fit in a 32-bit
signed integer. This was determined experimentally, not theoretically (unfortunately). */
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
@@ -253,7 +253,7 @@ void silk_burg_modified_c(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
- C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+ C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
diff --git a/silk/fixed/vector_ops_FIX.c b/silk/fixed/vector_ops_FIX.c
index d9498001..dcf84070 100644
--- a/silk/fixed/vector_ops_FIX.c
+++ b/silk/fixed/vector_ops_FIX.c
@@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
#endif
}
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
diff --git a/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
index bbb1ce0f..e58bf079 100644
--- a/silk/fixed/x86/burg_modified_FIX_sse4_1.c
+++ b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -42,7 +42,7 @@
#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
#define QA 25
-#define N_BITS_HEAD_ROOM 2
+#define N_BITS_HEAD_ROOM 3
#define MIN_RSHIFTS -16
#define MAX_RSHIFTS (32 - QA)
@@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1(
int arch /* I Run-time architecture */
)
{
- opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+ opus_int k, n, s, lz, rshifts, reached_max_gain;
opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
const opus_int16 *x_ptr;
opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
@@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1(
opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
+ opus_int64 C0_64;
__m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
__m128i CONST1 = _mm_set1_epi32(1);
@@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
- silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
- if( rshifts > MAX_RSHIFTS ) {
- C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
- silk_assert( C0 > 0 );
- rshifts = MAX_RSHIFTS;
+ C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
+ lz = silk_CLZ64(C0_64);
+ rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
+ if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
+ if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
+
+ if (rshifts > 0) {
+ C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
} else {
- lz = silk_CLZ32( C0 ) - 1;
- rshifts_extra = N_BITS_HEAD_ROOM - lz;
- if( rshifts_extra > 0 ) {
- rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
- C0 = silk_RSHIFT32( C0, rshifts_extra );
- } else {
- rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
- C0 = silk_LSHIFT32( C0, -rshifts_extra );
- }
- rshifts += rshifts_extra;
+ C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
}
+
CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
if( rshifts > 0 ) {
@@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
- silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+ silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
- tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
- tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
+ /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
+ but they cancel each other and the real result seems to always fit in a 32-bit
+ signed integer. This was determined experimentally, not theoretically (unfortunately). */
+ tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
+ tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
}
tmp1 = -tmp1; /* Q17 */
@@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
- C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+ C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
@@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1(
*res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
*res_nrg_Q = -rshifts;
}
+
+#ifdef OPUS_CHECK_ASM
+ {
+ opus_int32 res_nrg_c = 0;
+ opus_int res_nrg_Q_c = 0;
+ opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
+
+ silk_burg_modified_c(
+ &res_nrg_c,
+ &res_nrg_Q_c,
+ A_Q16_c,
+ x,
+ minInvGain_Q30,
+ subfr_length,
+ nb_subfr,
+ D,
+ 0
+ );
+
+ silk_assert( *res_nrg == res_nrg_c );
+ silk_assert( *res_nrg_Q == res_nrg_Q_c );
+ silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
+ }
+#endif
}
diff --git a/silk/fixed/x86/prefilter_FIX_sse.c b/silk/fixed/x86/prefilter_FIX_sse.c
deleted file mode 100644
index 555432cd..00000000
--- a/silk/fixed/x86/prefilter_FIX_sse.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
- OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
-#include "main.h"
-#include "celt/x86/x86cpu.h"
-
-void silk_warped_LPC_analysis_filter_FIX_sse4_1(
- opus_int32 state[], /* I/O State [order + 1] */
- opus_int32 res_Q2[], /* O Residual signal [length] */
- const opus_int16 coef_Q13[], /* I Coefficients [order] */
- const opus_int16 input[], /* I Input signal [length] */
- const opus_int16 lambda_Q16, /* I Warping factor */
- const opus_int length, /* I Length of input signal */
- const opus_int order /* I Filter order (even) */
-)
-{
- opus_int n, i;
- opus_int32 acc_Q11, tmp1, tmp2;
-
- /* Order must be even */
- celt_assert( ( order & 1 ) == 0 );
-
- if (order == 10)
- {
- if (0 == lambda_Q16)
- {
- __m128i coef_Q13_3210, coef_Q13_7654;
- __m128i coef_Q13_0123, coef_Q13_4567;
- __m128i state_0123, state_4567;
- __m128i xmm_product1, xmm_product2;
- __m128i xmm_tempa, xmm_tempb;
-
- register opus_int32 sum;
- register opus_int32 state_8, state_9, state_a;
- register opus_int64 coef_Q13_8, coef_Q13_9;
-
- celt_assert( length > 0 );
-
- coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
- coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
-
- coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
- coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
- coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
- coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
-
- state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
- state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
-
- state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
- state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
- state_8 = state[ 8 ];
- state_9 = state[ 9 ];
- state_a = 0;
-
- for( n = 0; n < length; n++ )
- {
- xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
- xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
-
- xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
- xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
- xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
- xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
-
- xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
- xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
-
- xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
- xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
-
- xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
- xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
- xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
-
- sum = (opus_int32)((coef_Q13_8 * state_8) >> 16);
- sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
-
- xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
- sum += _mm_cvtsi128_si32( xmm_tempa);
- res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
-
- /* move right */
- state_a = state_9;
- state_9 = state_8;
- state_8 = _mm_cvtsi128_si32( state_4567 );
- state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
-
- state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
- }
-
- _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
- _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
- state[ 8 ] = state_8;
- state[ 9 ] = state_9;
- state[ 10 ] = state_a;
-
- return;
- }
- }
-
- for( n = 0; n < length; n++ ) {
- /* Output of lowpass section */
- tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
- state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
- /* Output of allpass section */
- tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
- state[ 1 ] = tmp2;
- acc_Q11 = silk_RSHIFT( order, 1 );
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
- /* Loop over allpass sections */
- for( i = 2; i < order; i += 2 ) {
- /* Output of allpass section */
- tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
- state[ i ] = tmp1;
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
- /* Output of allpass section */
- tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
- state[ i + 1 ] = tmp2;
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
- }
- state[ order ] = tmp1;
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
- res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
- }
-}
diff --git a/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
index c1e90564..0cfb08d9 100644
--- a/silk/fixed/x86/vector_ops_FIX_sse4_1.c
+++ b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@@ -37,39 +37,36 @@
#include "SigProc_FIX.h"
#include "pitch.h"
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
)
{
- opus_int i, dataSize8;
+ opus_int i, dataSize4;
opus_int64 sum;
- __m128i xmm_tempa;
- __m128i inVec1_76543210, acc1;
- __m128i inVec2_76543210, acc2;
+ __m128i xmm_prod_20, xmm_prod_31;
+ __m128i inVec1_3210, acc1;
+ __m128i inVec2_3210, acc2;
sum = 0;
- dataSize8 = len & ~7;
+ dataSize4 = len & ~3;
acc1 = _mm_setzero_si128();
acc2 = _mm_setzero_si128();
- for( i = 0; i < dataSize8; i += 8 ) {
- inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
- inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+ for( i = 0; i < dataSize4; i += 4 ) {
+ inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
+ inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
+ xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
- /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
- inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+ inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
- xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 );
- /* equal shift right 8 bytes */
- inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
- inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
-
- acc1 = _mm_add_epi64( acc1, xmm_tempa );
- acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+ acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
+ acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
}
acc1 = _mm_add_epi64( acc1, acc2 );
@@ -84,5 +81,12 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
}
+#ifdef OPUS_CHECK_ASM
+ {
+ opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
+ silk_assert( sum == sum_c );
+ }
+#endif
+
return sum;
}
diff --git a/silk/main.h b/silk/main.h
index 1a33eed5..a5f56875 100644
--- a/silk/main.h
+++ b/silk/main.h
@@ -247,21 +247,21 @@ void silk_VQ_WMat_EC_c(
/************************************/
void silk_NSQ_c(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
#if !defined(OVERRIDE_silk_NSQ)
@@ -273,21 +273,21 @@ void silk_NSQ_c(
/* Noise shaping using delayed decision */
void silk_NSQ_del_dec_c(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
#if !defined(OVERRIDE_silk_NSQ_del_dec)
diff --git a/silk/x86/NSQ_del_dec_sse4_1.c b/silk/x86/NSQ_del_dec_sse4_1.c
index 2c75ede2..42735c52 100644
--- a/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/silk/x86/NSQ_del_dec_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -46,6 +46,7 @@ typedef struct {
opus_int32 Shape_Q14[ DECISION_DELAY ];
opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
opus_int32 LF_AR_Q14;
+ opus_int32 Diff_Q14;
opus_int32 Seed;
opus_int32 SeedInit;
opus_int32 RD_Q10;
@@ -56,6 +57,7 @@ typedef struct {
opus_int32 RD_Q10;
opus_int32 xq_Q14;
opus_int32 LF_AR_Q14;
+ opus_int32 Diff_Q14;
opus_int32 sLTP_shp_Q14;
opus_int32 LPC_exc_Q14;
} NSQ_sample_struct;
@@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
- const opus_int32 x_Q3[], /* I Input in Q3 */
+ const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@@ -112,21 +114,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
);
void silk_NSQ_del_dec_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
@@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1(
VARDECL( opus_int32, delayedGain_Q10 );
VARDECL( NSQ_del_dec_struct, psDelDec );
NSQ_del_dec_struct *psDD;
+#ifdef OPUS_CHECK_ASM
+ silk_nsq_state NSQ_c;
+ SideInfoIndices psIndices_c;
+ opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+ const opus_int8 *const pulses_a = pulses;
+#endif
SAVE_STACK;
+#ifdef OPUS_CHECK_ASM
+ ( void )pulses_a;
+ silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+ silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+ silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+ silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+ silk_NSQ_del_dec_c(
+ psEncC,
+ &NSQ_c,
+ &psIndices_c,
+ x16,
+ pulses_c,
+ PredCoef_Q12,
+ LTPCoef_Q14,
+ AR_Q13,
+ HarmShapeGain_Q14,
+ Tilt_Q14,
+ LF_shp_Q14,
+ Gains_Q16,
+ pitchL,
+ Lambda_Q10,
+ LTP_scale_Q14
+ );
+#endif
+
/* Set unvoiced lag to the previous one, overwrite later for voiced */
lag = NSQ->lagPrev;
@@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1(
psDD->SeedInit = psDD->Seed;
psDD->RD_Q10 = 0;
psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
+ psDD->Diff_Q14 = NSQ->sDiff_shp_Q14;
psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
@@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1(
LSF_interpolation_flag = 1;
}
- ALLOC( sLTP_Q15,
- psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+ ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
@@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
- AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
+ AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1(
}
}
- silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
+ silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
@@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1(
Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
- x_Q3 += psEncC->subfr_length;
+ x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1(
for( i = 0; i < decisionDelay; i++ ) {
last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
+
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
@@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1(
/* Update states */
NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
+ NSQ->sDiff_shp_Q14 = psDD->Diff_Q14;
NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
/* Save quantized speech signal */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+ silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+ silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+ silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
RESTORE_STACK;
}
@@ -345,6 +387,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+
VARDECL( NSQ_sample_pair, psSampleState );
NSQ_del_dec_struct *psDD;
NSQ_sample_struct *psSS;
@@ -356,6 +399,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
celt_assert( nStatesDelayedDecision > 0 );
ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
+ int rdo_offset = (Lambda_Q10 >> 1) - 512;
+
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@@ -407,8 +452,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Long-term shaping */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
- n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
- n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
+ n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+ n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
shp_lag_ptr++;
} else {
@@ -478,7 +523,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
- /* setp 4 */
+ /* step 4 */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
@@ -511,9 +556,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
/* Noise shape feedback */
- silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
+ celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
/* Output of lowpass section */
- tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+ tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
/* Output of allpass section */
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
psDD->sAR2_Q14[ 0 ] = tmp2;
@@ -543,9 +588,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Input minus prediction plus noise feedback */
/* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
- tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
+ tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
- tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */
+ tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */
tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
@@ -559,6 +604,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+ if (Lambda_Q10 > 2048) {
+ /* For aggressive RDO, the bias becomes more than one pulse. */
+ if (q1_Q10 > rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+ } else if (q1_Q10 < -rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+ } else if (q1_Q10 < 0) {
+ q1_Q0 = -1;
+ } else {
+ q1_Q0 = 0;
+ }
+ }
if( q1_Q0 > 0 ) {
q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
@@ -612,8 +669,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
- sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
- psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+ psSS[ 0 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+ sLF_AR_shp_Q14 = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
+ psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 0 ].xq_Q14 = xq_Q14;
@@ -626,14 +684,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
exc_Q14 = -exc_Q14;
}
-
/* Add predictions */
LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
- sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
- psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+ psSS[ 1 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+ sLF_AR_shp_Q14 = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
+ psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 1 ].xq_Q14 = xq_Q14;
@@ -705,6 +763,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psDD = &psDelDec[ k ];
psSS = &psSampleState[ k ][ 0 ];
psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
+ psDD->Diff_Q14 = psSS->Diff_Q14;
psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
@@ -728,7 +787,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
- const opus_int32 x_Q3[], /* I Input in Q3 */
+ const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@@ -742,51 +801,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
)
{
opus_int i, k, lag;
- opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+ opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
NSQ_del_dec_struct *psDD;
- __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+ __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
-
silk_assert( inv_gain_Q31 != 0 );
- /* Calculate gain adjustment factor */
- if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
- gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
- } else {
- gain_adj_Q16 = (opus_int32)1 << 16;
- }
-
/* Scale input */
- inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+ inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
- /* prepare inv_gain_Q23 in packed 4 32-bits */
- xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+ /* prepare inv_gain_Q26 in packed 4 32-bits */
+ xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
- xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+ xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
+
/* equal shift right 4 bytes*/
- xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
- xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
- xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+ xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+ xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
- xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
- xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+ xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+ xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
- xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+ xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
- _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
+ _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
- x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+ x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
- /* Save inverse gain */
- NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@@ -800,7 +849,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
/* Adjust for changing gain */
- if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+ if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+ gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
/* Scale long-term shaping state */
{
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@@ -841,6 +892,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
/* Scale scalar states */
psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
+ psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -855,5 +907,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
}
}
+
+ /* Save inverse gain */
+ NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}
diff --git a/silk/x86/NSQ_sse4_1.c b/silk/x86/NSQ_sse4_1.c
index b0315e35..a2a74659 100644
--- a/silk/x86/NSQ_sse4_1.c
+++ b/silk/x86/NSQ_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -37,17 +37,17 @@
#include "stack_alloc.h"
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- const opus_int32 x_Q3[], /* I input in Q3 */
- opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
- const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
- opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
- opus_int subfr, /* I subframe number */
- const opus_int LTP_scale_Q14, /* I */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
- const opus_int signal_type /* I Signal type */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ const opus_int16 x16[], /* I input */
+ opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
+ const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
+ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
+ opus_int subfr, /* I subframe number */
+ const opus_int LTP_scale_Q14, /* I */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
+ const opus_int signal_type /* I Signal type */
);
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
@@ -65,27 +65,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
+ opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
);
void silk_NSQ_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int k, lag, start_idx, LSF_interpolation_flag;
@@ -101,8 +102,41 @@ void silk_NSQ_sse4_1(
opus_int32 tmp1;
opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
+#ifdef OPUS_CHECK_ASM
+ silk_nsq_state NSQ_c;
+ SideInfoIndices psIndices_c;
+ opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+ const opus_int8 *const pulses_a = pulses;
+#endif
+
SAVE_STACK;
+#ifdef OPUS_CHECK_ASM
+ ( void )pulses_a;
+ silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+ silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+ silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+ silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+
+ silk_NSQ_c(
+ psEncC,
+ &NSQ_c,
+ &psIndices_c,
+ x16,
+ pulses_c,
+ PredCoef_Q12,
+ LTPCoef_Q14,
+ AR_Q13,
+ HarmShapeGain_Q14,
+ Tilt_Q14,
+ LF_shp_Q14,
+ Gains_Q16,
+ pitchL,
+ Lambda_Q10,
+ LTP_scale_Q14
+ );
+#endif
+
NSQ->rand_seed = psIndices->Seed;
/* Set unvoiced lag to the previous one, overwrite later for voiced */
@@ -172,8 +206,7 @@ void silk_NSQ_sse4_1(
LSF_interpolation_flag = 1;
}
- ALLOC( sLTP_Q15,
- psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+ ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
/* Set up pointers to start of sub frame */
@@ -183,7 +216,7 @@ void silk_NSQ_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
- AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
+ AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -209,12 +242,12 @@ void silk_NSQ_sse4_1(
}
}
- silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
+ silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
{
silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
- AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
+ AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
offset_Q10, psEncC->subfr_length, &(table[32]) );
}
else
@@ -224,7 +257,7 @@ void silk_NSQ_sse4_1(
offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
}
- x_Q3 += psEncC->subfr_length;
+ x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@@ -235,12 +268,19 @@ void silk_NSQ_sse4_1(
/* Save quantized speech and noise shaping signals */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+ silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+ silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+ silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
RESTORE_STACK;
}
-/***********************************/
-/* silk_noise_shape_quantizer_10_16 */
-/***********************************/
+/************************************/
+/* silk_noise_shape_quantizer_10_16 */
+/************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
@@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
+ opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
@@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int i;
opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
- opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+ opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
@@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
__m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
__m128i AR_shp_Q13_76543210;
+ int rdo_offset = (Lambda_Q10 >> 1) - 512;
+
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@@ -288,6 +331,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
xq_Q14 = psLPC_Q14[ 0 ];
+ sDiff_shp_Q14 = NSQ->sDiff_shp_Q14;
LTP_pred_Q13 = 0;
/* load a_Q12 */
@@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
- sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
- sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );
+ sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
+ sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 );
/* high part, use pmaddwd, results in 4 32-bit */
xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
@@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
- silk_assert( lag > 0 || signalType != TYPE_VOICED );
+ celt_assert( lag > 0 || signalType != TYPE_VOICED );
/* Combine prediction and noise shaping signals */
tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
- n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+ n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
shp_lag_ptr++;
@@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+ if (Lambda_Q10 > 2048) {
+ /* For aggressive RDO, the bias becomes more than one pulse. */
+ if (q1_Q10 > rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+ } else if (q1_Q10 < -rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+ } else if (q1_Q10 < 0) {
+ q1_Q0 = -1;
+ } else {
+ q1_Q0 = 0;
+ }
+ }
q1_Q10 = table[q1_Q0][0];
q2_Q10 = table[q1_Q0][1];
@@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Update states */
psLPC_Q14++;
*psLPC_Q14 = xq_Q14;
- sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
+ NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
+ sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
@@ -600,64 +657,54 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
}
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- const opus_int32 x_Q3[], /* I input in Q3 */
- opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
- const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
- opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
- opus_int subfr, /* I subframe number */
- const opus_int LTP_scale_Q14, /* I */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
- const opus_int signal_type /* I Signal type */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ const opus_int16 x16[], /* I input */
+ opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
+ const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
+ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
+ opus_int subfr, /* I subframe number */
+ const opus_int LTP_scale_Q14, /* I */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
+ const opus_int signal_type /* I Signal type */
)
{
opus_int i, lag;
- opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
- __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+ opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
+ __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
silk_assert( inv_gain_Q31 != 0 );
- /* Calculate gain adjustment factor */
- if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
- gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
- } else {
- gain_adj_Q16 = (opus_int32)1 << 16;
- }
-
/* Scale input */
- inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+ inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
- /* prepare inv_gain_Q23 in packed 4 32-bits */
- xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+ /* prepare inv_gain_Q26 in packed 4 32-bits */
+ xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
- xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+ xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
/* equal shift right 4 bytes*/
- xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
- xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
- xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+ xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+ xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
- xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
- xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+ xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+ xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
- xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+ xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
- _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
+ _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
- x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+ x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
- /* Save inverse gain */
- NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@@ -671,7 +718,9 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
/* Adjust for changing gain */
- if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+ if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+ gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
/* Scale long-term shaping state */
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
+ NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
}
+
+ /* Save inverse gain */
+ NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index 61efa8da..e49d5d4e 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -67,7 +67,7 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
#endif
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
@@ -76,18 +76,18 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
- ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+#define silk_inner_prod16(inVec1, inVec2, len, arch) \
+ ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
#else
-extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
+extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len);
-# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
- ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
+# define silk_inner_prod16(inVec1, inVec2, len, arch) \
+ ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
#endif
#endif
diff --git a/silk/x86/VAD_sse4_1.c b/silk/x86/VAD_sse4_1.c
index d02ddf4a..e7eaf971 100644
--- a/silk/x86/VAD_sse4_1.c
+++ b/silk/x86/VAD_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
SAVE_STACK;
+#ifdef OPUS_CHECK_ASM
+ silk_encoder_state psEncC_c;
+ opus_int ret_c;
+
+ silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
+ ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
+#endif
+
/* Safety checks */
silk_assert( VAD_N_BANDS == 4 );
celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
@@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
}
+ if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
+ speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
+ }
/* Power scaling */
if( speech_nrg <= 0 ) {
SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
- } else if( speech_nrg < 32768 ) {
- if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
- speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
- } else {
- speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
- }
+ } else if( speech_nrg < 16384 ) {
+ speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
/* square-root */
speech_nrg = silk_SQRT_APPROX( speech_nrg );
@@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
}
+#ifdef OPUS_CHECK_ASM
+ silk_assert( ret == ret_c );
+ silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
+#endif
+
RESTORE_STACK;
return( ret );
}
diff --git a/silk/x86/VQ_WMat_EC_sse4_1.c b/silk/x86/VQ_WMat_EC_sse4_1.c
index 74d6c6d0..2c7d18d0 100644
--- a/silk/x86/VQ_WMat_EC_sse4_1.c
+++ b/silk/x86/VQ_WMat_EC_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -38,105 +38,136 @@
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
)
{
opus_int k, gain_tmp_Q7;
const opus_int8 *cb_row_Q7;
- opus_int16 diff_Q14[ 5 ];
- opus_int32 sum1_Q14, sum2_Q16;
+ opus_int32 neg_xX_Q24[ 5 ];
+ opus_int32 sum1_Q15, sum2_Q24;
+ opus_int32 bits_res_Q8, bits_tot_Q8;
+ __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
+
+ /* Negate and convert to new Q domain */
+ neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
+ neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
+ neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
+ neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
+ neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
+
+ v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
+ v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
- __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
/* Loop over codebook */
- *rate_dist_Q14 = silk_int32_MAX;
+ *rate_dist_Q8 = silk_int32_MAX;
+ *res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
+ /* If things go really bad, at least *ind is set to something safe. */
+ *ind = 0;
for( k = 0; k < L; k++ ) {
+ opus_int32 penalty;
gain_tmp_Q7 = cb_gain_Q7[k];
-
- diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
-
- C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
- C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
- C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
- C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
-
- diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
- diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
- diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
- diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
-
/* Weighted rate */
- sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
+ /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
+ sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
/* Penalty for too large gain */
- sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
-
- silk_assert( sum1_Q14 >= 0 );
-
- /* first row of W_Q18 */
- C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
- C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
- C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
-
- C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
- C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
-
- C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
- C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
-
- C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
- C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
-
- C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
- sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
-
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] );
-
- /* second row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] );
- sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] );
-
- /* third row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
- sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] );
-
- /* fourth row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] );
- sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] );
-
- /* last row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] );
-
- silk_assert( sum1_Q14 >= 0 );
+ penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
+
+ /* first row of XX_Q17 */
+ v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
+ v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
+ v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
+ v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
+ v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+ v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
+ sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
+ sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 0 ], cb_row_Q7[ 0 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 0 ] );
+
+ /* second row of XX_Q17 */
+ sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[ 7 ], cb_row_Q7[ 2 ] );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 8 ], cb_row_Q7[ 3 ] );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 9 ], cb_row_Q7[ 4 ] );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 6 ], cb_row_Q7[ 1 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 1 ] );
+
+ /* third row of XX_Q17 */
+ sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 2 ] );
+
+ /* fourth row of XX_Q17 */
+ sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 3 ] );
+
+ /* last row of XX_Q17 */
+ sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 4 ] );
/* find best */
- if( sum1_Q14 < *rate_dist_Q14 ) {
- *rate_dist_Q14 = sum1_Q14;
- *ind = (opus_int8)k;
- *gain_Q7 = gain_tmp_Q7;
+ if( sum1_Q15 >= 0 ) {
+ /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
+ bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
+ /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
+ bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
+ if( bits_tot_Q8 <= *rate_dist_Q8 ) {
+ *rate_dist_Q8 = bits_tot_Q8;
+ *res_nrg_Q15 = sum1_Q15 + penalty;
+ *ind = (opus_int8)k;
+ *gain_Q7 = gain_tmp_Q7;
+ }
}
/* Go to next cbk vector */
cb_row_Q7 += LTP_ORDER;
}
+
+#ifdef OPUS_CHECK_ASM
+ {
+ opus_int8 ind_c = 0;
+ opus_int32 res_nrg_Q15_c = 0;
+ opus_int32 rate_dist_Q8_c = 0;
+ opus_int gain_Q7_c = 0;
+
+ silk_VQ_WMat_EC_c(
+ &ind_c,
+ &res_nrg_Q15_c,
+ &rate_dist_Q8_c,
+ &gain_Q7_c,
+ XX_Q17,
+ xX_Q17,
+ cb_Q7,
+ cb_gain_Q7,
+ cl_Q5,
+ subfr_len,
+ max_gain_Q7,
+ L
+ );
+
+ silk_assert( *ind == ind_c );
+ silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
+ silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
+ silk_assert( *gain_Q7 == gain_Q7_c );
+ }
+#endif
}
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index 2f15d448..0a0391a2 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -34,73 +34,72 @@
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
# define OVERRIDE_silk_VQ_WMat_EC
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
);
#if defined OPUS_X86_PRESUME_SSE4_1
-#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L, arch) \
- ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L))
+#define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L, arch) \
+ ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L))
#else
extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
);
-# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L, arch) \
- ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L))
+# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L, arch) \
+ ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L))
#endif
-#endif
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
# define OVERRIDE_silk_NSQ
void silk_NSQ_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
#if defined OPUS_X86_PRESUME_SSE4_1
@@ -113,21 +112,21 @@ void silk_NSQ_sse4_1(
#else
extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
@@ -140,57 +139,56 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
# define OVERRIDE_silk_NSQ_del_dec
void silk_NSQ_del_dec_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
#if defined OPUS_X86_PRESUME_SSE4_1
-#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
- ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#else
extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
-# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
- ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#endif
-#endif
void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
index 32dcc3ca..ca13cde9 100644
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -41,16 +41,16 @@
#include "fixed/main_FIX.h"
-opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
+opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
) = {
- silk_inner_prod16_aligned_64_c, /* non-sse */
- silk_inner_prod16_aligned_64_c,
- silk_inner_prod16_aligned_64_c,
- MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
- MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */
+ silk_inner_prod16_c, /* non-sse */
+ silk_inner_prod16_c,
+ silk_inner_prod16_c,
+ MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
+ MAY_HAVE_SSE4_1( silk_inner_prod16 ) /* avx */
};
#endif
@@ -66,23 +66,22 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */
};
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
silk_NSQ_c, /* non-sse */
silk_NSQ_c,
@@ -90,21 +89,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */
};
-#endif
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
) = {
silk_VQ_WMat_EC_c, /* non-sse */
silk_VQ_WMat_EC_c,
@@ -112,25 +110,23 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */
};
-#endif
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
silk_NSQ_del_dec_c, /* non-sse */
silk_NSQ_del_dec_c,
@@ -138,7 +134,6 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
};
-#endif
#if defined(FIXED_POINT)