path: root/silk/fixed
author     xiangmingzhu <xiangzhu@cisco.com>       2014-04-30 15:48:07 +0800
committer  Jean-Marc Valin <jmvalin@jmvalin.ca>    2014-10-03 21:16:00 -0400
commit     c95c9a048f3283afb2e412b10085d4f7c19e1412 (patch)
tree       ed8873af6559d7a98922e0fed85be47c826ef521 /silk/fixed
parent     80460334b77d70e665a390503cd8992cdad06c10 (diff)
download   opus-c95c9a048f3283afb2e412b10085d4f7c19e1412.tar.gz
Cisco optimization for x86 & fixed point
1. Only for fixed point, on x86 platforms (32-bit and 64-bit); uses SIMD intrinsics up to SSE4.2.
2. Use "configure --enable-fixed-point --enable-intrinsics" to enable the optimization; it is disabled by default.
3. Official test cases are verified and passed.

Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org>
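Most of the hunks below simply thread a run-time `int arch` argument through the fixed-point analysis functions so that the new SSE4.1 kernels added at the bottom of the patch (silk_burg_modified_sse4_1, silk_warped_LPC_analysis_filter_FIX_sse4_1, silk_inner_prod16_aligned_64_sse4_1) can be selected per call, with the existing C code kept as the `_c` fallback. Below is a minimal sketch of that kind of arch-based selection; the helper name and the ARCH_X86_SSE4_1 constant are illustrative assumptions, not the actual Opus dispatch macros.

/* Illustrative sketch only: run-time selection on the `arch` argument.
   inner_prod16_dispatch() and ARCH_X86_SSE4_1 are assumed names, not Opus API. */
#include "SigProc_FIX.h"    /* opus_int16 / opus_int64 typedefs */

opus_int64 silk_inner_prod16_aligned_64_c(
    const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len );
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len );

static opus_int64 inner_prod16_dispatch(
    const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len, int arch )
{
    if( arch >= ARCH_X86_SSE4_1 ) {                                    /* CPU supports SSE4.1 */
        return silk_inner_prod16_aligned_64_sse4_1( inVec1, inVec2, len );
    }
    return silk_inner_prod16_aligned_64_c( inVec1, inVec2, len );      /* plain C fallback */
}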
Diffstat (limited to 'silk/fixed')
-rw-r--r--  silk/fixed/LTP_analysis_filter_FIX.c     19
-rw-r--r--  silk/fixed/burg_modified_FIX.c           10
-rw-r--r--  silk/fixed/corrMatrix_FIX.c              10
-rw-r--r--  silk/fixed/encode_frame_FIX.c            12
-rw-r--r--  silk/fixed/find_LPC_FIX.c                 2
-rw-r--r--  silk/fixed/find_LTP_FIX.c                 7
-rw-r--r--  silk/fixed/find_pitch_lags_FIX.c          2
-rw-r--r--  silk/fixed/find_pred_coefs_FIX.c          7
-rw-r--r--  silk/fixed/main_FIX.h                    12
-rw-r--r--  silk/fixed/pitch_analysis_core_FIX.c     22
-rw-r--r--  silk/fixed/prefilter_FIX.c                6
-rw-r--r--  silk/fixed/residual_energy_FIX.c          5
-rw-r--r--  silk/fixed/vector_ops_FIX.c               7
-rw-r--r--  silk/fixed/x86/burg_modified_FIX_sse.c  375
-rw-r--r--  silk/fixed/x86/prefilter_FIX_sse.c      160
-rw-r--r--  silk/fixed/x86/vector_ops_FIX_sse.c      88
16 files changed, 692 insertions, 52 deletions
diff --git a/silk/fixed/LTP_analysis_filter_FIX.c b/silk/fixed/LTP_analysis_filter_FIX.c
index a9419080..5574e706 100644
--- a/silk/fixed/LTP_analysis_filter_FIX.c
+++ b/silk/fixed/LTP_analysis_filter_FIX.c
@@ -45,7 +45,7 @@ void silk_LTP_analysis_filter_FIX(
const opus_int16 *x_ptr, *x_lag_ptr;
opus_int16 Btmp_Q14[ LTP_ORDER ];
opus_int16 *LTP_res_ptr;
- opus_int k, i, j;
+ opus_int k, i;
opus_int32 LTP_est;
x_ptr = x;
@@ -53,9 +53,12 @@ void silk_LTP_analysis_filter_FIX(
for( k = 0; k < nb_subfr; k++ ) {
x_lag_ptr = x_ptr - pitchL[ k ];
- for( i = 0; i < LTP_ORDER; i++ ) {
- Btmp_Q14[ i ] = LTPCoef_Q14[ k * LTP_ORDER + i ];
- }
+
+ Btmp_Q14[ 0 ] = LTPCoef_Q14[ k * LTP_ORDER ];
+ Btmp_Q14[ 1 ] = LTPCoef_Q14[ k * LTP_ORDER + 1 ];
+ Btmp_Q14[ 2 ] = LTPCoef_Q14[ k * LTP_ORDER + 2 ];
+ Btmp_Q14[ 3 ] = LTPCoef_Q14[ k * LTP_ORDER + 3 ];
+ Btmp_Q14[ 4 ] = LTPCoef_Q14[ k * LTP_ORDER + 4 ];
/* LTP analysis FIR filter */
for( i = 0; i < subfr_length + pre_length; i++ ) {
@@ -63,9 +66,11 @@ void silk_LTP_analysis_filter_FIX(
/* Long-term prediction */
LTP_est = silk_SMULBB( x_lag_ptr[ LTP_ORDER / 2 ], Btmp_Q14[ 0 ] );
- for( j = 1; j < LTP_ORDER; j++ ) {
- LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ LTP_ORDER / 2 - j ], Btmp_Q14[ j ] );
- }
+ LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 1 ], Btmp_Q14[ 1 ] );
+ LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 0 ], Btmp_Q14[ 2 ] );
+ LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -1 ], Btmp_Q14[ 3 ] );
+ LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -2 ], Btmp_Q14[ 4 ] );
+
LTP_est = silk_RSHIFT_ROUND( LTP_est, 14 ); /* round and -> Q0*/
/* Subtract long-term prediction */
diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c
index 2a260775..d0a07f94 100644
--- a/silk/fixed/burg_modified_FIX.c
+++ b/silk/fixed/burg_modified_FIX.c
@@ -42,7 +42,7 @@ POSSIBILITY OF SUCH DAMAGE.
#define MAX_RSHIFTS (32 - QA)
/* Compute reflection coefficients from input signal */
-void silk_burg_modified(
+void silk_burg_modified_c(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
@@ -68,7 +68,7 @@ void silk_burg_modified(
silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
- C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr );
+ C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
@@ -87,7 +87,7 @@ void silk_burg_modified(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
- silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n ), rshifts );
+ silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@@ -248,12 +248,12 @@ void silk_burg_modified(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
- C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D ), rshifts );
+ C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
- C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D ), -rshifts );
+ C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch), -rshifts);
}
}
/* Approximate residual energy */
diff --git a/silk/fixed/corrMatrix_FIX.c b/silk/fixed/corrMatrix_FIX.c
index c6172705..c1d437c7 100644
--- a/silk/fixed/corrMatrix_FIX.c
+++ b/silk/fixed/corrMatrix_FIX.c
@@ -42,7 +42,8 @@ void silk_corrVector_FIX(
const opus_int L, /* I Length of vectors */
const opus_int order, /* I Max lag for correlation */
opus_int32 *Xt, /* O Pointer to X'*t correlation vector [order] */
- const opus_int rshifts /* I Right shifts of correlations */
+ const opus_int rshifts, /* I Right shifts of correlations */
+ int arch /* I Run-time architecture */
)
{
opus_int lag, i;
@@ -65,7 +66,7 @@ void silk_corrVector_FIX(
} else {
silk_assert( rshifts == 0 );
for( lag = 0; lag < order; lag++ ) {
- Xt[ lag ] = silk_inner_prod_aligned( ptr1, ptr2, L ); /* X[:,lag]'*t */
+ Xt[ lag ] = silk_inner_prod_aligned( ptr1, ptr2, L, arch ); /* X[:,lag]'*t */
ptr1--; /* Go to next column of X */
}
}
@@ -78,7 +79,8 @@ void silk_corrMatrix_FIX(
const opus_int order, /* I Max lag for correlation */
const opus_int head_room, /* I Desired headroom */
opus_int32 *XX, /* O Pointer to X'*X correlation matrix [ order x order ] */
- opus_int *rshifts /* I/O Right shifts of correlations */
+ opus_int *rshifts, /* I/O Right shifts of correlations */
+ int arch /* I Run-time architecture */
)
{
opus_int i, j, lag, rshifts_local, head_room_rshifts;
@@ -138,7 +140,7 @@ void silk_corrMatrix_FIX(
} else {
for( lag = 1; lag < order; lag++ ) {
/* Inner product of column 0 and column lag: X[:,0]'*X[:,lag] */
- energy = silk_inner_prod_aligned( ptr1, ptr2, L );
+ energy = silk_inner_prod_aligned( ptr1, ptr2, L, arch );
matrix_ptr( XX, lag, 0, order ) = energy;
matrix_ptr( XX, 0, lag, order ) = energy;
/* Calculate remaining off diagonal: X[:,j]'*X[:,j + lag] */
diff --git a/silk/fixed/encode_frame_FIX.c b/silk/fixed/encode_frame_FIX.c
index b490986b..1561f238 100644
--- a/silk/fixed/encode_frame_FIX.c
+++ b/silk/fixed/encode_frame_FIX.c
@@ -48,7 +48,7 @@ void silk_encode_do_VAD_FIX(
/****************************/
/* Voice Activity Detection */
/****************************/
- silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1 );
+ silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch );
/**************************************************/
/* Convert speech activity into VAD and DTX flags */
@@ -196,11 +196,13 @@ opus_int silk_encode_frame_FIX(
if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) {
silk_NSQ_del_dec( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses,
sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14,
- sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14 );
+ sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14,
+ psEnc->sCmn.arch );
} else {
silk_NSQ( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses,
sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14,
- sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14 );
+ sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14,
+ psEnc->sCmn.arch);
}
/****************************************/
@@ -371,12 +373,12 @@ static OPUS_INLINE void silk_LBRR_encode_FIX(
silk_NSQ_del_dec( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3,
psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14,
psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14,
- psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14 );
+ psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch );
} else {
silk_NSQ( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3,
psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14,
psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14,
- psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14 );
+ psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch );
}
/* Restore original gains */
diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c
index 783d32e2..e11cdc86 100644
--- a/silk/fixed/find_LPC_FIX.c
+++ b/silk/fixed/find_LPC_FIX.c
@@ -95,7 +95,7 @@ void silk_find_LPC_FIX(
silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );
/* Calculate residual energy with NLSF interpolation */
- silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder );
+ silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch );
silk_sum_sqr_shift( &res_nrg0, &rshift0, LPC_res + psEncC->predictLPCOrder, subfr_length - psEncC->predictLPCOrder );
silk_sum_sqr_shift( &res_nrg1, &rshift1, LPC_res + psEncC->predictLPCOrder + subfr_length, subfr_length - psEncC->predictLPCOrder );
diff --git a/silk/fixed/find_LTP_FIX.c b/silk/fixed/find_LTP_FIX.c
index 8c4d7037..1314a281 100644
--- a/silk/fixed/find_LTP_FIX.c
+++ b/silk/fixed/find_LTP_FIX.c
@@ -50,7 +50,8 @@ void silk_find_LTP_FIX(
const opus_int subfr_length, /* I subframe length */
const opus_int nb_subfr, /* I number of subframes */
const opus_int mem_offset, /* I number of samples in LTP memory */
- opus_int corr_rshifts[ MAX_NB_SUBFR ] /* O right shifts applied to correlations */
+ opus_int corr_rshifts[ MAX_NB_SUBFR ], /* O right shifts applied to correlations */
+ int arch /* I Run-time architecture */
)
{
opus_int i, k, lshift;
@@ -84,10 +85,10 @@ void silk_find_LTP_FIX(
rr_shifts += ( LTP_CORRS_HEAD_ROOM - LZs );
}
corr_rshifts[ k ] = rr_shifts;
- silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, LTP_CORRS_HEAD_ROOM, WLTP_ptr, &corr_rshifts[ k ] ); /* WLTP_fix_ptr in Q( -corr_rshifts[ k ] ) */
+ silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, LTP_CORRS_HEAD_ROOM, WLTP_ptr, &corr_rshifts[ k ], arch ); /* WLTP_fix_ptr in Q( -corr_rshifts[ k ] ) */
/* The correlation vector always has lower max abs value than rr and/or RR so head room is assured */
- silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr, corr_rshifts[ k ] ); /* Rr_fix_ptr in Q( -corr_rshifts[ k ] ) */
+ silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr, corr_rshifts[ k ], arch ); /* Rr_fix_ptr in Q( -corr_rshifts[ k ] ) */
if( corr_rshifts[ k ] > rr_shifts ) {
rr[ k ] = silk_RSHIFT( rr[ k ], corr_rshifts[ k ] - rr_shifts ); /* rr[ k ] in Q( -corr_rshifts[ k ] ) */
}
diff --git a/silk/fixed/find_pitch_lags_FIX.c b/silk/fixed/find_pitch_lags_FIX.c
index 620f8dcd..b8440a82 100644
--- a/silk/fixed/find_pitch_lags_FIX.c
+++ b/silk/fixed/find_pitch_lags_FIX.c
@@ -112,7 +112,7 @@ void silk_find_pitch_lags_FIX(
/*****************************************/
/* LPC analysis filtering */
/*****************************************/
- silk_LPC_analysis_filter( res, x_buf, A_Q12, buf_len, psEnc->sCmn.pitchEstimationLPCOrder );
+ silk_LPC_analysis_filter( res, x_buf, A_Q12, buf_len, psEnc->sCmn.pitchEstimationLPCOrder, psEnc->sCmn.arch );
if( psEnc->sCmn.indices.signalType != TYPE_NO_VOICE_ACTIVITY && psEnc->sCmn.first_frame_after_reset == 0 ) {
/* Threshold for pitch estimator */
diff --git a/silk/fixed/find_pred_coefs_FIX.c b/silk/fixed/find_pred_coefs_FIX.c
index 5c22f828..9706eb8a 100644
--- a/silk/fixed/find_pred_coefs_FIX.c
+++ b/silk/fixed/find_pred_coefs_FIX.c
@@ -89,11 +89,12 @@ void silk_find_pred_coefs_FIX(
/* LTP analysis */
silk_find_LTP_FIX( psEncCtrl->LTPCoef_Q14, WLTP, &psEncCtrl->LTPredCodGain_Q7,
res_pitch, psEncCtrl->pitchL, Wght_Q15, psEnc->sCmn.subfr_length,
- psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift );
+ psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift, psEnc->sCmn.arch );
/* Quantize LTP gain parameters */
silk_quant_LTP_gains( psEncCtrl->LTPCoef_Q14, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
- &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr);
+ &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr,
+ psEnc->sCmn.arch);
/* Control LTP scaling */
silk_LTP_scale_ctrl_FIX( psEnc, psEncCtrl, condCoding );
@@ -139,7 +140,7 @@ void silk_find_pred_coefs_FIX(
/* Calculate residual energy using quantized LPC coefficients */
silk_residual_energy_FIX( psEncCtrl->ResNrg, psEncCtrl->ResNrgQ, LPC_in_pre, psEncCtrl->PredCoef_Q12, local_gains,
- psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder );
+ psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder, psEnc->sCmn.arch );
/* Copy to prediction struct for use in next frame for interpolation */
silk_memcpy( psEnc->sCmn.prev_NLSFq_Q15, NLSF_Q15, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) );
diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h
index a56ca07a..ffeb4f33 100644
--- a/silk/fixed/main_FIX.h
+++ b/silk/fixed/main_FIX.h
@@ -166,7 +166,8 @@ void silk_find_LTP_FIX(
const opus_int subfr_length, /* I subframe length */
const opus_int nb_subfr, /* I number of subframes */
const opus_int mem_offset, /* I number of samples in LTP memory */
- opus_int corr_rshifts[ MAX_NB_SUBFR ] /* O right shifts applied to correlations */
+ opus_int corr_rshifts[ MAX_NB_SUBFR ], /* O right shifts applied to correlations */
+ int arch /* I Run-time architecture */
);
void silk_LTP_analysis_filter_FIX(
@@ -190,7 +191,8 @@ void silk_residual_energy_FIX(
const opus_int32 gains[ MAX_NB_SUBFR ], /* I Quantization gains */
const opus_int subfr_length, /* I Subframe length */
const opus_int nb_subfr, /* I Number of subframes */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
);
/* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */
@@ -220,7 +222,8 @@ void silk_corrMatrix_FIX(
const opus_int order, /* I Max lag for correlation */
const opus_int head_room, /* I Desired headroom */
opus_int32 *XX, /* O Pointer to X'*X correlation matrix [ order x order ] */
- opus_int *rshifts /* I/O Right shifts of correlations */
+ opus_int *rshifts, /* I/O Right shifts of correlations */
+ int arch /* I Run-time architecture */
);
/* Calculates correlation vector X'*t */
@@ -230,7 +233,8 @@ void silk_corrVector_FIX(
const opus_int L, /* I Length of vectors */
const opus_int order, /* I Max lag for correlation */
opus_int32 *Xt, /* O Pointer to X'*t correlation vector [order] */
- const opus_int rshifts /* I Right shifts of correlations */
+ const opus_int rshifts, /* I Right shifts of correlations */
+ int arch /* I Run-time architecture */
);
/* Add noise to matrix diagonal */
diff --git a/silk/fixed/pitch_analysis_core_FIX.c b/silk/fixed/pitch_analysis_core_FIX.c
index 1641a0fb..01bb9fc0 100644
--- a/silk/fixed/pitch_analysis_core_FIX.c
+++ b/silk/fixed/pitch_analysis_core_FIX.c
@@ -72,7 +72,8 @@ static void silk_P_Ana_calc_energy_st3(
opus_int start_lag, /* I lag offset to search around */
opus_int sf_length, /* I length of one 5 ms subframe */
opus_int nb_subfr, /* I number of subframes */
- opus_int complexity /* I Complexity setting */
+ opus_int complexity, /* I Complexity setting */
+ int arch /* I Run-time architecture */
);
/*************************************************************/
@@ -195,8 +196,8 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
/* Calculate first vector products before loop */
cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ];
- normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ );
- normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ ) );
+ normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch );
+ normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ, arch ) );
normalizer = silk_ADD32( normalizer, silk_SMULBB( SF_LENGTH_8KHZ, 4000 ) );
matrix_ptr( C, k, 0, CSTRIDE_4KHZ ) =
@@ -334,7 +335,7 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
silk_assert( target_ptr >= frame_8kHz );
silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
- energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ ), 1 );
+ energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch ), 1 );
for( j = 0; j < length_d_comp; j++ ) {
d = d_comp[ j ];
basis_ptr = target_ptr - d;
@@ -343,9 +344,9 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
silk_assert( basis_ptr >= frame_8kHz );
silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
- cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ );
+ cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ, arch );
if( cross_corr > 0 ) {
- energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ );
+ energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ, arch );
matrix_ptr( C, k, d - ( MIN_LAG_8KHZ - 2 ), CSTRIDE_8KHZ ) =
(opus_int16)silk_DIV32_varQ( cross_corr,
silk_ADD32( energy_target,
@@ -519,14 +520,14 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
silk_P_Ana_calc_corr_st3( cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );
- silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
+ silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );
lag_counter = 0;
silk_assert( lag == silk_SAT16( lag ) );
contour_bias_Q15 = silk_DIV32_16( SILK_FIX_CONST( PE_FLATCONTOUR_BIAS, 15 ), lag );
target_ptr = &input_frame_ptr[ PE_LTP_MEM_LENGTH_MS * Fs_kHz ];
- energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length ), 1 );
+ energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length, arch ), 1 );
for( d = start_lag; d <= end_lag; d++ ) {
for( j = 0; j < nb_cbk_search; j++ ) {
cross_corr = 0;
@@ -671,7 +672,8 @@ static void silk_P_Ana_calc_energy_st3(
opus_int start_lag, /* I lag offset to search around */
opus_int sf_length, /* I length of one 5 ms subframe */
opus_int nb_subfr, /* I number of subframes */
- opus_int complexity /* I Complexity setting */
+ opus_int complexity, /* I Complexity setting */
+ int arch /* I Run-time architecture */
)
{
const opus_int16 *target_ptr, *basis_ptr;
@@ -705,7 +707,7 @@ static void silk_P_Ana_calc_energy_st3(
/* Calculate the energy for first lag */
basis_ptr = target_ptr - ( start_lag + matrix_ptr( Lag_range_ptr, k, 0, 2 ) );
- energy = silk_inner_prod_aligned( basis_ptr, basis_ptr, sf_length );
+ energy = silk_inner_prod_aligned( basis_ptr, basis_ptr, sf_length, arch );
silk_assert( energy >= 0 );
scratch_mem[ lag_counter ] = energy;
lag_counter++;
diff --git a/silk/fixed/prefilter_FIX.c b/silk/fixed/prefilter_FIX.c
index e1753be5..c945dd35 100644
--- a/silk/fixed/prefilter_FIX.c
+++ b/silk/fixed/prefilter_FIX.c
@@ -50,8 +50,7 @@ static OPUS_INLINE void silk_prefilt_FIX(
opus_int length /* I Length of signals */
);
-#ifndef OVERRIDE_silk_warped_LPC_analysis_filter_FIX
-void silk_warped_LPC_analysis_filter_FIX(
+void silk_warped_LPC_analysis_filter_FIX_c(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
@@ -92,7 +91,6 @@ void silk_warped_LPC_analysis_filter_FIX(
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
}
}
-#endif /* OVERRIDE_silk_warped_LPC_analysis_filter_FIX */
void silk_prefilter_FIX(
silk_encoder_state_FIX *psEnc, /* I/O Encoder state */
@@ -137,7 +135,7 @@ void silk_prefilter_FIX(
/* Short term FIR filtering*/
silk_warped_LPC_analysis_filter_FIX( P->sAR_shp, st_res_Q2, AR1_shp_Q13, px,
- psEnc->sCmn.warping_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder );
+ psEnc->sCmn.warping_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder, psEnc->sCmn.arch );
/* Reduce (mainly) low frequencies during harmonic emphasis */
B_Q10[ 0 ] = silk_RSHIFT_ROUND( psEncCtrl->GainsPre_Q14[ k ], 4 );
diff --git a/silk/fixed/residual_energy_FIX.c b/silk/fixed/residual_energy_FIX.c
index 105ae318..41f74778 100644
--- a/silk/fixed/residual_energy_FIX.c
+++ b/silk/fixed/residual_energy_FIX.c
@@ -42,7 +42,8 @@ void silk_residual_energy_FIX(
const opus_int32 gains[ MAX_NB_SUBFR ], /* I Quantization gains */
const opus_int subfr_length, /* I Subframe length */
const opus_int nb_subfr, /* I Number of subframes */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
)
{
opus_int offset, i, j, rshift, lz1, lz2;
@@ -60,7 +61,7 @@ void silk_residual_energy_FIX(
silk_assert( ( nb_subfr >> 1 ) * ( MAX_NB_SUBFR >> 1 ) == nb_subfr );
for( i = 0; i < nb_subfr >> 1; i++ ) {
/* Calculate half frame LPC residual signal including preceding samples */
- silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order );
+ silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order, arch );
/* Point to first subframe of the just calculated LPC residual signal */
LPC_res_ptr = LPC_res + LPC_order;
diff --git a/silk/fixed/vector_ops_FIX.c b/silk/fixed/vector_ops_FIX.c
index ad8f07fd..d9498001 100644
--- a/silk/fixed/vector_ops_FIX.c
+++ b/silk/fixed/vector_ops_FIX.c
@@ -71,11 +71,12 @@ void silk_scale_vector32_Q26_lshift_18(
opus_int32 silk_inner_prod_aligned(
const opus_int16 *const inVec1, /* I input vector 1 */
const opus_int16 *const inVec2, /* I input vector 2 */
- const opus_int len /* I vector lengths */
+ const opus_int len, /* I vector lengths */
+ int arch /* I Run-time architecture */
)
{
#ifdef FIXED_POINT
- return celt_inner_prod(inVec1, inVec2, len);
+ return celt_inner_prod(inVec1, inVec2, len, arch);
#else
opus_int i;
opus_int32 sum = 0;
@@ -86,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
#endif
}
-opus_int64 silk_inner_prod16_aligned_64(
+opus_int64 silk_inner_prod16_aligned_64_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
diff --git a/silk/fixed/x86/burg_modified_FIX_sse.c b/silk/fixed/x86/burg_modified_FIX_sse.c
new file mode 100644
index 00000000..3756095f
--- /dev/null
+++ b/silk/fixed/x86/burg_modified_FIX_sse.c
@@ -0,0 +1,375 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "SigProc_FIX.h"
+#include "define.h"
+#include "tuning_parameters.h"
+#include "pitch.h"
+#include "celt/x86/x86cpu.h"
+
+#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
+
+#define QA 25
+#define N_BITS_HEAD_ROOM 2
+#define MIN_RSHIFTS -16
+#define MAX_RSHIFTS (32 - QA)
+
+/* Compute reflection coefficients from input signal */
+void silk_burg_modified_sse4_1(
+ opus_int32 *res_nrg, /* O Residual energy */
+ opus_int *res_nrg_Q, /* O Residual energy Q value */
+ opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
+ const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */
+ const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
+ const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
+ const opus_int nb_subfr, /* I Number of subframes stacked in x */
+ const opus_int D, /* I Order */
+ int arch /* I Run-time architecture */
+)
+{
+ opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+ opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
+ const opus_int16 *x_ptr;
+ opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
+ opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ];
+ opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ];
+ opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
+ opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
+ opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
+
+ __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
+ __m128i CONST1 = _mm_set1_epi32(1);
+
+ silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
+
+ /* Compute autocorrelations, added over subframes */
+ silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
+ if( rshifts > MAX_RSHIFTS ) {
+ C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
+ silk_assert( C0 > 0 );
+ rshifts = MAX_RSHIFTS;
+ } else {
+ lz = silk_CLZ32( C0 ) - 1;
+ rshifts_extra = N_BITS_HEAD_ROOM - lz;
+ if( rshifts_extra > 0 ) {
+ rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
+ C0 = silk_RSHIFT32( C0, rshifts_extra );
+ } else {
+ rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
+ C0 = silk_LSHIFT32( C0, -rshifts_extra );
+ }
+ rshifts += rshifts_extra;
+ }
+ CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
+ silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
+ if( rshifts > 0 ) {
+ for( s = 0; s < nb_subfr; s++ ) {
+ x_ptr = x + s * subfr_length;
+ for( n = 1; n < D + 1; n++ ) {
+ C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
+ silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+ }
+ }
+ } else {
+ for( s = 0; s < nb_subfr; s++ ) {
+ int i;
+ opus_int32 d;
+ x_ptr = x + s * subfr_length;
+ celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
+ for( n = 1; n < D + 1; n++ ) {
+ for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
+ d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
+ xcorr[ n - 1 ] += d;
+ }
+ for( n = 1; n < D + 1; n++ ) {
+ C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
+ }
+ }
+ }
+ silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
+
+ /* Initialize */
+ CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
+
+ invGain_Q30 = (opus_int32)1 << 30;
+ reached_max_gain = 0;
+ for( n = 0; n < D; n++ ) {
+ /* Update first row of correlation matrix (without first element) */
+ /* Update last row of correlation matrix (without last element, stored in reversed order) */
+ /* Update C * Af */
+ /* Update C * flipud(Af) (stored in reversed order) */
+ if( rshifts > -2 ) {
+ for( s = 0; s < nb_subfr; s++ ) {
+ x_ptr = x + s * subfr_length;
+ x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts ); /* Q(16-rshifts) */
+ x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts ); /* Q(16-rshifts) */
+ tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 ); /* Q(QA-16) */
+ tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 ); /* Q(QA-16) */
+ for( k = 0; k < n; k++ ) {
+ C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
+ C_last_row[ k ] = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
+ Atmp_QA = Af_QA[ k ];
+ tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */
+ tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] ); /* Q(QA-16) */
+ }
+ tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts ); /* Q(16-rshifts) */
+ tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts ); /* Q(16-rshifts) */
+ for( k = 0; k <= n; k++ ) {
+ CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q( -rshift ) */
+ CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */
+ }
+ }
+ } else {
+ for( s = 0; s < nb_subfr; s++ ) {
+ x_ptr = x + s * subfr_length;
+ x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */
+ x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */
+ tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */
+ tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */
+
+ X1_3210 = _mm_set1_epi32( x1 );
+ X2_3210 = _mm_set1_epi32( x2 );
+ TMP1_3210 = _mm_setzero_si128();
+ TMP2_3210 = _mm_setzero_si128();
+ for( k = 0; k < n - 3; k += 4 ) {
+ PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
+ SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
+ FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
+ PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ LAST_3210 = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
+ ATMP_3210 = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );
+
+ T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
+ T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );
+
+ ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
+ ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
+ ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );
+
+ FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
+ LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );
+
+ PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
+ SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );
+
+ _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
+ _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );
+
+ TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
+ TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
+ }
+
+ TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
+ TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
+ TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
+ TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );
+
+ tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
+ tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
+
+ for( ; k < n; k++ ) {
+ C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
+ C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
+ Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
+ tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
+ tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
+ }
+
+ tmp1 = -tmp1; /* Q17 */
+ tmp2 = -tmp2; /* Q17 */
+
+ {
+ __m128i xmm_tmp1, xmm_tmp2;
+ __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
+ __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;
+
+ xmm_tmp1 = _mm_set1_epi32( tmp1 );
+ xmm_tmp2 = _mm_set1_epi32( tmp2 );
+
+ for( k = 0; k <= n - 3; k += 4 ) {
+ xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
+ xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );
+
+ xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+ xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
+ xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
+
+ /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
+ xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+ xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
+ xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
+ xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
+ xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );
+
+ xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
+ xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
+ xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
+ xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );
+
+ xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
+ xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
+
+ X1_3210 = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
+ PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );
+
+ X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
+ PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );
+
+ _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
+ _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
+ }
+
+ for( ; k <= n; k++ ) {
+ CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
+ silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */
+ CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
+ silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
+ }
+ }
+ }
+ }
+
+ /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
+ tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */
+ tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */
+ num = 0; /* Q( -rshifts ) */
+ nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */
+ for( k = 0; k < n; k++ ) {
+ Atmp_QA = Af_QA[ k ];
+ lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
+ lz = silk_min( 32 - QA, lz );
+ Atmp1 = silk_LSHIFT32( Atmp_QA, lz ); /* Q( QA + lz ) */
+
+ tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
+ tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
+ num = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
+ nrg = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
+ Atmp1 ), 32 - QA - lz ); /* Q( 1-rshifts ) */
+ }
+ CAf[ n + 1 ] = tmp1; /* Q( -rshifts ) */
+ CAb[ n + 1 ] = tmp2; /* Q( -rshifts ) */
+ num = silk_ADD32( num, tmp2 ); /* Q( -rshifts ) */
+ num = silk_LSHIFT32( -num, 1 ); /* Q( 1-rshifts ) */
+
+ /* Calculate the next order reflection (parcor) coefficient */
+ if( silk_abs( num ) < nrg ) {
+ rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
+ } else {
+ rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
+ }
+
+ /* Update inverse prediction gain */
+ tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+ tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
+ if( tmp1 <= minInvGain_Q30 ) {
+ /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
+ tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */
+ rc_Q31 = silk_SQRT_APPROX( tmp2 ); /* Q15 */
+ /* Newton-Raphson iteration */
+ rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 ); /* Q15 */
+ rc_Q31 = silk_LSHIFT32( rc_Q31, 16 ); /* Q31 */
+ if( num < 0 ) {
+ /* Ensure adjusted reflection coefficients has the original sign */
+ rc_Q31 = -rc_Q31;
+ }
+ invGain_Q30 = minInvGain_Q30;
+ reached_max_gain = 1;
+ } else {
+ invGain_Q30 = tmp1;
+ }
+
+ /* Update the AR coefficients */
+ for( k = 0; k < (n + 1) >> 1; k++ ) {
+ tmp1 = Af_QA[ k ]; /* QA */
+ tmp2 = Af_QA[ n - k - 1 ]; /* QA */
+ Af_QA[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* QA */
+ Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* QA */
+ }
+ Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA ); /* QA */
+
+ if( reached_max_gain ) {
+ /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
+ for( k = n + 1; k < D; k++ ) {
+ Af_QA[ k ] = 0;
+ }
+ break;
+ }
+
+ /* Update C * Af and C * Ab */
+ for( k = 0; k <= n + 1; k++ ) {
+ tmp1 = CAf[ k ]; /* Q( -rshifts ) */
+ tmp2 = CAb[ n - k + 1 ]; /* Q( -rshifts ) */
+ CAf[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* Q( -rshifts ) */
+ CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* Q( -rshifts ) */
+ }
+ }
+
+ if( reached_max_gain ) {
+ for( k = 0; k < D; k++ ) {
+ /* Scale coefficients */
+ A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
+ }
+ /* Subtract energy of preceding samples from C0 */
+ if( rshifts > 0 ) {
+ for( s = 0; s < nb_subfr; s++ ) {
+ x_ptr = x + s * subfr_length;
+ C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+ }
+ } else {
+ for( s = 0; s < nb_subfr; s++ ) {
+ x_ptr = x + s * subfr_length;
+ C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
+ }
+ }
+ /* Approximate residual energy */
+ *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
+ *res_nrg_Q = -rshifts;
+ } else {
+ /* Return residual energy */
+ nrg = CAf[ 0 ]; /* Q( -rshifts ) */
+ tmp1 = (opus_int32)1 << 16; /* Q16 */
+ for( k = 0; k < D; k++ ) {
+ Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */
+ nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */
+ tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */
+ A_Q16[ k ] = -Atmp1;
+ }
+ *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
+ *res_nrg_Q = -rshifts;
+ }
+}
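A note on the reduction near the end of the first vectorized block above: after the 4-wide loop, TMP1_3210 and TMP2_3210 each hold four partial 32-bit sums, and the _mm_unpackhi_epi64 / _mm_shufflelo_epi16 pair collapses them into lane 0 before _mm_cvtsi128_si32 extracts the total. A self-contained equivalent of that horizontal add, written as an assumed helper purely for illustration:

#include <emmintrin.h>      /* SSE2 is enough for the horizontal add itself */
#include "opus_types.h"     /* opus_int32 */

/* hsum_epi32() is an assumed helper name; it mirrors the in-line reduction above. */
static opus_int32 hsum_epi32( __m128i v )
{
    v = _mm_add_epi32( v, _mm_unpackhi_epi64( v, v ) );       /* lane0 += lane2, lane1 += lane3 */
    v = _mm_add_epi32( v, _mm_shufflelo_epi16( v, 0x0E ) );   /* lane0 += lane1 */
    return _mm_cvtsi128_si32( v );                            /* total of the four lanes */
}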
diff --git a/silk/fixed/x86/prefilter_FIX_sse.c b/silk/fixed/x86/prefilter_FIX_sse.c
new file mode 100644
index 00000000..488a603f
--- /dev/null
+++ b/silk/fixed/x86/prefilter_FIX_sse.c
@@ -0,0 +1,160 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+void silk_warped_LPC_analysis_filter_FIX_sse4_1(
+ opus_int32 state[], /* I/O State [order + 1] */
+ opus_int32 res_Q2[], /* O Residual signal [length] */
+ const opus_int16 coef_Q13[], /* I Coefficients [order] */
+ const opus_int16 input[], /* I Input signal [length] */
+ const opus_int16 lambda_Q16, /* I Warping factor */
+ const opus_int length, /* I Length of input signal */
+ const opus_int order /* I Filter order (even) */
+)
+{
+ opus_int n, i;
+ opus_int32 acc_Q11, tmp1, tmp2;
+
+ /* Order must be even */
+ silk_assert( ( order & 1 ) == 0 );
+
+ if (order == 10)
+ {
+ if (0 == lambda_Q16)
+ {
+ __m128i coef_Q13_3210, coef_Q13_7654;
+ __m128i coef_Q13_0123, coef_Q13_4567;
+ __m128i state_0123, state_4567;
+ __m128i xmm_product1, xmm_product2;
+ __m128i xmm_tempa, xmm_tempb;
+
+ register opus_int32 sum;
+ register opus_int32 state_8, state_9, state_a;
+ register opus_int64 coef_Q13_8, coef_Q13_9;
+
+ silk_assert( length > 0 );
+
+ coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
+ coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
+
+ coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+ coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
+ coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
+
+ state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
+ state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
+
+ state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+ state_8 = state[ 8 ];
+ state_9 = state[ 9 ];
+ state_a = 0;
+
+ for( n = 0; n < length; n++ )
+ {
+ xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
+ xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
+
+ xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+ xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
+ xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
+
+ xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
+ xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
+
+ xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
+ xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
+
+ xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
+ xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
+ xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
+
+ sum = (coef_Q13_8 * state_8) >> 16;
+ sum += (coef_Q13_9 * state_9) >> 16;
+
+ xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
+ sum += _mm_cvtsi128_si32( xmm_tempa);
+ res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
+
+ /* move right */
+ state_a = state_9;
+ state_9 = state_8;
+ state_8 = _mm_cvtsi128_si32( state_4567 );
+ state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
+
+ state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
+ }
+
+ _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+ _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+ state[ 8 ] = state_8;
+ state[ 9 ] = state_9;
+ state[ 10 ] = state_a;
+
+ return;
+ }
+ }
+
+ for( n = 0; n < length; n++ ) {
+ /* Output of lowpass section */
+ tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
+ state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
+ /* Output of allpass section */
+ tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
+ state[ 1 ] = tmp2;
+ acc_Q11 = silk_RSHIFT( order, 1 );
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
+ /* Loop over allpass sections */
+ for( i = 2; i < order; i += 2 ) {
+ /* Output of allpass section */
+ tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
+ state[ i ] = tmp1;
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
+ /* Output of allpass section */
+ tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
+ state[ i + 1 ] = tmp2;
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
+ }
+ state[ order ] = tmp1;
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
+ res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
+ }
+}
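The scalar fallback loop at the end of this file also documents the Q-format bookkeeping the SSE4.1 path has to preserve: state[] holds the input scaled to Q14 (input << 14), coef_Q13[] is Q13, each accumulation lands in Q11, and the final silk_RSHIFT_ROUND( acc_Q11, 9 ) produces the Q2 residual. A tiny worked example of that scaling, assuming the usual SILK semantics silk_SMLAWB( a, b, c ) = a + ( ( b * (opus_int16)c ) >> 16 ):

/* Illustrative Q-format check (values are made up for the example). */
opus_int32 state_Q14 = 1 << 14;                                   /* 1.0 in Q14 */
opus_int16 coef_Q13  = 1 << 12;                                   /* 0.5 in Q13 */
opus_int32 acc_Q11   = (opus_int32)( ( (opus_int64)state_Q14 * coef_Q13 ) >> 16 );
/* Q14 * Q13 = Q27; >> 16 leaves Q11, so acc_Q11 == 1 << 10, i.e. 0.5 in Q11. */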
diff --git a/silk/fixed/x86/vector_ops_FIX_sse.c b/silk/fixed/x86/vector_ops_FIX_sse.c
new file mode 100644
index 00000000..c1e90564
--- /dev/null
+++ b/silk/fixed/x86/vector_ops_FIX_sse.c
@@ -0,0 +1,88 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+
+#include "SigProc_FIX.h"
+#include "pitch.h"
+
+opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+ const opus_int16 *inVec1, /* I input vector 1 */
+ const opus_int16 *inVec2, /* I input vector 2 */
+ const opus_int len /* I vector lengths */
+)
+{
+ opus_int i, dataSize8;
+ opus_int64 sum;
+
+ __m128i xmm_tempa;
+ __m128i inVec1_76543210, acc1;
+ __m128i inVec2_76543210, acc2;
+
+ sum = 0;
+ dataSize8 = len & ~7;
+
+ acc1 = _mm_setzero_si128();
+ acc2 = _mm_setzero_si128();
+
+ for( i = 0; i < dataSize8; i += 8 ) {
+ inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
+ inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+
+ /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
+ inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+
+ xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 );
+ /* equal shift right 8 bytes */
+ inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+ inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
+
+ acc1 = _mm_add_epi64( acc1, xmm_tempa );
+ acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+ }
+
+ acc1 = _mm_add_epi64( acc1, acc2 );
+
+ /* equal shift right 8 bytes */
+ acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+ acc1 = _mm_add_epi64( acc1, acc2 );
+
+ _mm_storel_epi64( (__m128i *)&sum, acc1 );
+
+ for( ; i < len; i++ ) {
+ sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
+ }
+
+ return sum;
+}
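The wrap-around comment in the loop above refers to _mm_madd_epi16, which computes the pairwise products a[2i]*b[2i] + a[2i+1]*b[2i+1] as a signed 32-bit result; the only input that overflows is a pair of (-32768)*(-32768) products, whose sum 2^31 wraps to INT32_MIN. A small stand-alone check of that corner case (illustrative, not part of the patch):

#include <emmintrin.h>
#include <stdio.h>

int main( void )
{
    __m128i a = _mm_set1_epi16( -32768 );
    __m128i b = _mm_set1_epi16( -32768 );
    /* each 32-bit lane: (-32768)*(-32768) + (-32768)*(-32768) = 2^31, which wraps */
    __m128i p = _mm_madd_epi16( a, b );
    printf( "%d\n", _mm_cvtsi128_si32( p ) );   /* prints -2147483648 */
    return 0;
}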