summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: lvqcl <lvqcl.mail@gmail.com> 2018-09-19 20:03:37 +0300
committer: Erik de Castro Lopo <erikd@mega-nerd.com> 2018-09-20 07:20:13 +1000
commit: 421961f00b505dcad41603305b47eda0b2ddfe92 (patch)
tree: 051990fb5c29ee092201be9562fa6479834a25f3
parent: faafa4c82c31e5aed7bc7c0e87a379825372c6ac (diff)
download: flac-421961f00b505dcad41603305b47eda0b2ddfe92.tar.gz
Replace hadd with shuffle + add
-rw-r--r-- src/libFLAC/lpc_intrin_sse41.c 16
-rw-r--r-- src/libFLAC/stream_encoder_intrin_avx2.c 4
-rw-r--r-- src/libFLAC/stream_encoder_intrin_sse2.c 4
-rw-r--r-- src/libFLAC/stream_encoder_intrin_ssse3.c 4
4 files changed, 14 insertions, 14 deletions
diff --git a/src/libFLAC/lpc_intrin_sse41.c b/src/libFLAC/lpc_intrin_sse41.c
index 96dd20de..4ef3d3e4 100644
--- a/src/libFLAC/lpc_intrin_sse41.c
+++ b/src/libFLAC/lpc_intrin_sse41.c
@@ -980,8 +980,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1]));
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0]));
- summ = _mm_hadd_epi32(summ, summ);
- summ = _mm_hadd_epi32(summ, summ);
+ summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+ summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
@@ -1009,8 +1009,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
for (i = 0;;) {
summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));
- summ = _mm_hadd_epi32(summ, summ);
- summ = _mm_hadd_epi32(summ, summ);
+ summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+ summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
@@ -1079,8 +1079,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
summ = _mm_madd_epi16(dat[1], qlp[1]);
summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0]));
- summ = _mm_hadd_epi32(summ, summ);
- summ = _mm_hadd_epi32(summ, summ);
+ summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+ summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
@@ -1109,8 +1109,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
for(i = 0;;) {
summ = _mm_madd_epi16(dat0, qlp0);
- summ = _mm_hadd_epi32(summ, summ);
- summ = _mm_hadd_epi32(summ, summ);
+ summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+ summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
diff --git a/src/libFLAC/stream_encoder_intrin_avx2.c b/src/libFLAC/stream_encoder_intrin_avx2.c
index 265e6fe7..94bde0e7 100644
--- a/src/libFLAC/stream_encoder_intrin_avx2.c
+++ b/src/libFLAC/stream_encoder_intrin_avx2.c
@@ -83,8 +83,8 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
sum128 = _mm_add_epi32(sum128, res128);
}
- sum128 = _mm_hadd_epi32(sum128, sum128);
- sum128 = _mm_hadd_epi32(sum128, sum128);
+ sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1,0,3,2)));
+ sum128 = _mm_add_epi32(sum128, _mm_shufflelo_epi16(sum128, _MM_SHUFFLE(1,0,3,2)));
abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
diff --git a/src/libFLAC/stream_encoder_intrin_sse2.c b/src/libFLAC/stream_encoder_intrin_sse2.c
index ed94ec3e..44ee4d35 100644
--- a/src/libFLAC/stream_encoder_intrin_sse2.c
+++ b/src/libFLAC/stream_encoder_intrin_sse2.c
@@ -97,8 +97,8 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
mm_sum = _mm_add_epi32(mm_sum, mm_res);
}
- mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8));
- mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4));
+ mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
+ mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
diff --git a/src/libFLAC/stream_encoder_intrin_ssse3.c b/src/libFLAC/stream_encoder_intrin_ssse3.c
index b5996f7f..d384dc03 100644
--- a/src/libFLAC/stream_encoder_intrin_ssse3.c
+++ b/src/libFLAC/stream_encoder_intrin_ssse3.c
@@ -86,8 +86,8 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
mm_sum = _mm_add_epi32(mm_sum, mm_res);
}
- mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
- mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
+ mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
+ mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)