From 421961f00b505dcad41603305b47eda0b2ddfe92 Mon Sep 17 00:00:00 2001 From: lvqcl Date: Wed, 19 Sep 2018 20:03:37 +0300 Subject: Replace hadd with shuffle + add --- src/libFLAC/lpc_intrin_sse41.c | 16 ++++++++-------- src/libFLAC/stream_encoder_intrin_avx2.c | 4 ++-- src/libFLAC/stream_encoder_intrin_sse2.c | 4 ++-- src/libFLAC/stream_encoder_intrin_ssse3.c | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/libFLAC/lpc_intrin_sse41.c b/src/libFLAC/lpc_intrin_sse41.c index 96dd20de..4ef3d3e4 100644 --- a/src/libFLAC/lpc_intrin_sse41.c +++ b/src/libFLAC/lpc_intrin_sse41.c @@ -980,8 +980,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_ summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1])); summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0])); - summ = _mm_hadd_epi32(summ, summ); - summ = _mm_hadd_epi32(summ, summ); + summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); + summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); summ = _mm_sra_epi32(summ, cnt); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); @@ -1009,8 +1009,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_ for (i = 0;;) { summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0])); - summ = _mm_hadd_epi32(summ, summ); - summ = _mm_hadd_epi32(summ, summ); + summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); + summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); summ = _mm_sra_epi32(summ, cnt); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); @@ -1079,8 +1079,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint summ = _mm_madd_epi16(dat[1], qlp[1]); summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0])); - summ = _mm_hadd_epi32(summ, summ); - summ = _mm_hadd_epi32(summ, summ); + summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); + summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); summ = _mm_sra_epi32(summ, cnt); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); @@ -1109,8 +1109,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint for(i = 0;;) { summ = _mm_madd_epi16(dat0, qlp0); - summ = _mm_hadd_epi32(summ, summ); - summ = _mm_hadd_epi32(summ, summ); + summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); + summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); summ = _mm_sra_epi32(summ, cnt); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); diff --git a/src/libFLAC/stream_encoder_intrin_avx2.c b/src/libFLAC/stream_encoder_intrin_avx2.c index 265e6fe7..94bde0e7 100644 --- a/src/libFLAC/stream_encoder_intrin_avx2.c +++ b/src/libFLAC/stream_encoder_intrin_avx2.c @@ -83,8 +83,8 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual sum128 = _mm_add_epi32(sum128, res128); } - sum128 = _mm_hadd_epi32(sum128, sum128); - sum128 = _mm_hadd_epi32(sum128, sum128); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1,0,3,2))); + sum128 = _mm_add_epi32(sum128, _mm_shufflelo_epi16(sum128, _MM_SHUFFLE(1,0,3,2))); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128); /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) diff --git a/src/libFLAC/stream_encoder_intrin_sse2.c b/src/libFLAC/stream_encoder_intrin_sse2.c index ed94ec3e..44ee4d35 100644 --- a/src/libFLAC/stream_encoder_intrin_sse2.c +++ b/src/libFLAC/stream_encoder_intrin_sse2.c @@ -97,8 +97,8 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual mm_sum = _mm_add_epi32(mm_sum, mm_res); } - mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8)); - mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4)); + mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2))); + mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2))); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) diff --git a/src/libFLAC/stream_encoder_intrin_ssse3.c b/src/libFLAC/stream_encoder_intrin_ssse3.c index b5996f7f..d384dc03 100644 --- a/src/libFLAC/stream_encoder_intrin_ssse3.c +++ b/src/libFLAC/stream_encoder_intrin_ssse3.c @@ -86,8 +86,8 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua mm_sum = _mm_add_epi32(mm_sum, mm_res); } - mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); - mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); + mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2))); + mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2))); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) -- cgit v1.2.1