summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Erik de Castro Lopo <erikd@mega-nerd.com> 2014-09-21 09:28:36 +1000
committer: Erik de Castro Lopo <erikd@mega-nerd.com> 2014-09-21 09:34:32 +1000
commit: 71875b0c75d5ff686f3a9bf3adc268d6442eeabb (patch)
tree: 383f15a500cb9f48c10166cdd77d20a86a37fca0
parent: 6abc4803872bfc5fbe7a14b9e7e914cdf0fea560 (diff)
download: flac-71875b0c75d5ff686f3a9bf3adc268d6442eeabb.tar.gz
fixed_intrin_sse[23].c : Add new, simpler SSE code.
It's simpler but not faster, so it is disabled by default. Maybe it will be faster on newer CPUs, though.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
-rw-r--r-- src/libFLAC/fixed_intrin_sse2.c | 14
-rw-r--r-- src/libFLAC/fixed_intrin_ssse3.c | 14
2 files changed, 24 insertions, 4 deletions
diff --git a/src/libFLAC/fixed_intrin_sse2.c b/src/libFLAC/fixed_intrin_sse2.c
index 35fe256a..26bc1621 100644
--- a/src/libFLAC/fixed_intrin_sse2.c
+++ b/src/libFLAC/fixed_intrin_sse2.c
@@ -80,6 +80,7 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[]
__m128i err0, err1, tmp;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
+#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@@ -87,7 +88,11 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[]
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
-
+#else
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
+ err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
+#endif
tmp = _mm_slli_si128(err0, 12); // e0 0 0 0
last_error = _mm_srli_si128(err1, 4); // 0 e0 e1 e2
last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3
@@ -172,6 +177,7 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 d
__m128i err0, err1, tmp;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
+#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@@ -179,7 +185,11 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 d
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
-
+#else
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
+ err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
+#endif
tmp = _mm_slli_si128(err0, 12); // e0 0 0 0
last_error = _mm_srli_si128(err1, 4); // 0 e0 e1 e2
last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3
diff --git a/src/libFLAC/fixed_intrin_ssse3.c b/src/libFLAC/fixed_intrin_ssse3.c
index e444c71a..50c663d8 100644
--- a/src/libFLAC/fixed_intrin_ssse3.c
+++ b/src/libFLAC/fixed_intrin_ssse3.c
@@ -80,6 +80,7 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[
__m128i err0, err1;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
+#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@@ -87,7 +88,11 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
-
+#else
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
+ err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
+#endif
last_error = _mm_alignr_epi8(err0, err1, 4); // e0 e1 e2 e3
err0 = _mm_abs_epi32(err0);
@@ -166,6 +171,7 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32
__m128i err0, err1;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
+#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@@ -173,7 +179,11 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
-
+#else
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
+ last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
+ err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
+#endif
last_error = _mm_alignr_epi8(err0, err1, 4); // e0 e1 e2 e3
err0 = _mm_abs_epi32(err0);