diff options
author | John Ridges <jridges@masque.com> | 2013-06-06 23:12:57 -0400 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-06-06 23:12:57 -0400 |
commit | e50e8084a9e8108f3535b1ffadc5c0f2341b4548 (patch) | |
tree | bea8afec871eb3c4880669f65834b64a9dd2f893 | |
parent | 70c9c3a482ed5c93f90fdb206d5d3b740f554aed (diff) | |
download | opus-e50e8084a9e8108f3535b1ffadc5c0f2341b4548.tar.gz |
Improved SSE version of xcorr_kernel()
The loop no longer reads past its buffer and is slightly faster.
Also fixes RESTORE_STACK in celt_iir().
-rw-r--r-- | celt/celt_lpc.c | 2 | ||||
-rw-r--r-- | celt/x86/pitch_sse.h | 77 |
2 files changed, 28 insertions, 51 deletions
```diff
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 3ffeea78..39339e0e 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -217,8 +217,8 @@ void celt_iir(const opus_val32 *_x,
    }
    for(i=0;i<ord;i++)
       mem[i] = _y[N-i-1];
-#endif
    RESTORE_STACK;
+#endif
 }
 
 void _celt_autocorr(
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 6f5a2d1b..a55a69b7 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2013 Xiph.Org Foundation
-   Written by Jean-Marc Valin */
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
 /**
    @file pitch_sse.h
    @brief Pitch analysis
@@ -37,61 +36,39 @@
 #include "arch.h"
 
 #define OVERRIDE_XCORR_KERNEL
-
-static inline void xcorr_kernel(const opus_val16 * _x, const opus_val16 * _y, opus_val32 _sum[4], int len)
+static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
 {
    int j;
-   __m128 sum;
-   __m128 x;
-   __m128 y;
-   __m128 y2;
-   __m128 y1;
-   __m128 y3;
-   __m128 tmp;
-   sum = _mm_loadu_ps(_sum);
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
 
-   x = _mm_loadu_ps(_x);
-   y = _mm_loadu_ps(_y);
-   y1 = _mm_loadu_ps(_y+1);
-   for (j=0;j<len-3;j+=4)
-   {
-      _x+=4;
-      _y+=4;
-      y2 = _mm_loadu_ps(_y);
-      y3 = _mm_loadu_ps(_y+1);
-      tmp = _mm_shuffle_ps(x, x, 0x00);
-      sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
-      tmp = _mm_shuffle_ps(x, x, 0x55);
-      sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y1));
-      tmp = _mm_shuffle_ps(x, x, 0xaa);
-      y = _mm_shuffle_ps(y, y2, 0x4e);
-      sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
-      tmp = _mm_shuffle_ps(x, x, 0xff);
-      y = _mm_shuffle_ps(y1, y3, 0x4e);
-      sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
-      x = _mm_loadu_ps(_x);
-      y = y2;
-      y1 = y3;
-   }
-   _y++;
-   if (j++<len)
+   for (j = 0; j < len-3; j += 4)
    {
-      tmp = _mm_shuffle_ps(x, x, 0x00);
-      sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
-   }
-   if (j++<len)
-   {
-      tmp = _mm_shuffle_ps(x, x, 0x55);
-      y = _mm_loadu_ps(_y++);
-      sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 y0 = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),y0));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(y0,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(y0,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
    }
-   if (j++<len)
+   if (j < len)
    {
-      tmp = _mm_shuffle_ps(x, x, 0xaa);
-      y = _mm_loadu_ps(_y++);
-      sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
    }
-   _mm_storeu_ps(_sum, sum);
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
 }
 
 #endif
```