diff options
Diffstat (limited to 'openjpeg/src/lib/openjp2/mct.c')
-rw-r--r-- | openjpeg/src/lib/openjp2/mct.c | 264 |
1 files changed, 248 insertions, 16 deletions
diff --git a/openjpeg/src/lib/openjp2/mct.c b/openjpeg/src/lib/openjp2/mct.c index 60ee0969d..e1f2f50dd 100644 --- a/openjpeg/src/lib/openjp2/mct.c +++ b/openjpeg/src/lib/openjp2/mct.c @@ -40,6 +40,12 @@ #ifdef __SSE__ #include <xmmintrin.h> #endif +#ifdef __SSE2__ +#include <emmintrin.h> +#endif +#ifdef __SSE4_1__ +#include <smmintrin.h> +#endif #include "opj_includes.h" @@ -64,16 +70,61 @@ const OPJ_FLOAT64 * opj_mct_get_mct_norms_real () } /* <summary> */ -/* Foward reversible MCT. */ +/* Forward reversible MCT. */ /* </summary> */ +#ifdef __SSE2__ void opj_mct_encode( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, OPJ_UINT32 n) { - OPJ_UINT32 i; - for(i = 0; i < n; ++i) { + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + /* buffer are aligned on 16 bytes */ + assert( ((size_t)c0 & 0xf) == 0 ); + assert( ((size_t)c1 & 0xf) == 0 ); + assert( ((size_t)c2 & 0xf) == 0 ); + + for(i = 0; i < (len & ~3U); i += 4) { + __m128i y, u, v; + __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); + __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); + __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); + y = _mm_add_epi32(g, g); + y = _mm_add_epi32(y, b); + y = _mm_add_epi32(y, r); + y = _mm_srai_epi32(y, 2); + u = _mm_sub_epi32(b, g); + v = _mm_sub_epi32(r, g); + _mm_store_si128((__m128i *)&(c0[i]), y); + _mm_store_si128((__m128i *)&(c1[i]), u); + _mm_store_si128((__m128i *)&(c2[i]), v); + } + + for(; i < len; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = (r + (g * 2) + b) >> 2; + OPJ_INT32 u = b - g; + OPJ_INT32 v = r - g; + c0[i] = y; + c1[i] = u; + c2[i] = v; + } +} +#else +void opj_mct_encode( + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) +{ + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + for(i = 0; i < len; ++i) { OPJ_INT32 r = c0[i]; OPJ_INT32 g = c1[i]; OPJ_INT32 b = c2[i]; @@ -85,14 +136,51 @@ void opj_mct_encode( c2[i] = v; } } +#endif /* <summary> */ /* Inverse reversible MCT. */ /* </summary> */ +#ifdef __SSE2__ void opj_mct_decode( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) +{ + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + for(i = 0; i < (len & ~3U); i += 4) { + __m128i r, g, b; + __m128i y = _mm_load_si128((const __m128i *)&(c0[i])); + __m128i u = _mm_load_si128((const __m128i *)&(c1[i])); + __m128i v = _mm_load_si128((const __m128i *)&(c2[i])); + g = y; + g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2)); + r = _mm_add_epi32(v, g); + b = _mm_add_epi32(u, g); + _mm_store_si128((__m128i *)&(c0[i]), r); + _mm_store_si128((__m128i *)&(c1[i]), g); + _mm_store_si128((__m128i *)&(c2[i]), b); + } + for (; i < len; ++i) { + OPJ_INT32 y = c0[i]; + OPJ_INT32 u = c1[i]; + OPJ_INT32 v = c2[i]; + OPJ_INT32 g = y - ((u + v) >> 2); + OPJ_INT32 r = v + g; + OPJ_INT32 b = u + g; + c0[i] = r; + c1[i] = g; + c2[i] = b; + } +} +#else +void opj_mct_decode( + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, OPJ_UINT32 n) { OPJ_UINT32 i; @@ -108,6 +196,7 @@ void opj_mct_decode( c2[i] = b; } } +#endif /* <summary> */ /* Get norm of basis function of reversible MCT. */ @@ -117,12 +206,154 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) { } /* <summary> */ -/* Foward irreversible MCT. */ +/* Forward irreversible MCT. */ /* </summary> */ +#ifdef __SSE4_1__ void opj_mct_encode_real( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) +{ + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + const __m128i ry = _mm_set1_epi32(2449); + const __m128i gy = _mm_set1_epi32(4809); + const __m128i by = _mm_set1_epi32(934); + const __m128i ru = _mm_set1_epi32(1382); + const __m128i gu = _mm_set1_epi32(2714); + /* const __m128i bu = _mm_set1_epi32(4096); */ + /* const __m128i rv = _mm_set1_epi32(4096); */ + const __m128i gv = _mm_set1_epi32(3430); + const __m128i bv = _mm_set1_epi32(666); + const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), _MM_SHUFFLE(1, 0, 1, 0)); + + for(i = 0; i < (len & ~3U); i += 4) { + __m128i lo, hi; + __m128i y, u, v; + __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); + __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); + __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); + + lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, ry); + hi = _mm_mul_epi32(hi, ry); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + y = _mm_blend_epi16(lo, hi, 0xCC); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gy); + hi = _mm_mul_epi32(hi, gy); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, by); + hi = _mm_mul_epi32(hi, by); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *)&(c0[i]), y); + + /*lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, mulround); + hi = _mm_mul_epi32(hi, mulround);*/ + lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); + hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); + lo = _mm_slli_epi64(lo, 12); + hi = _mm_slli_epi64(hi, 12); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + u = _mm_blend_epi16(lo, hi, 0xCC); + + lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, ru); + hi = _mm_mul_epi32(hi, ru); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gu); + hi = _mm_mul_epi32(hi, gu); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *)&(c1[i]), u); + + /*lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, mulround); + hi = _mm_mul_epi32(hi, mulround);*/ + lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); + hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); + lo = _mm_slli_epi64(lo, 12); + hi = _mm_slli_epi64(hi, 12); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + v = _mm_blend_epi16(lo, hi, 0xCC); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gv); + hi = _mm_mul_epi32(hi, gv); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, bv); + hi = _mm_mul_epi32(hi, bv); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *)&(c2[i]), v); + } + for(; i < len; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, 4809) + opj_int_fix_mul(b, 934); + OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, 2714) + opj_int_fix_mul(b, 4096); + OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, 3430) - opj_int_fix_mul(b, 666); + c0[i] = y; + c1[i] = u; + c2[i] = v; + } +} +#else +void opj_mct_encode_real( + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, OPJ_UINT32 n) { OPJ_UINT32 i; @@ -138,14 +369,15 @@ void opj_mct_encode_real( c2[i] = v; } } +#endif /* <summary> */ /* Inverse irreversible MCT. */ /* </summary> */ void opj_mct_decode_real( - OPJ_FLOAT32* restrict c0, - OPJ_FLOAT32* restrict c1, - OPJ_FLOAT32* restrict c2, + OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_UINT32 n) { OPJ_UINT32 i; |