summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorScott LaVarnway <slavarnway@google.com>2022-12-20 15:43:44 -0800
committerScott LaVarnway <slavarnway@google.com>2022-12-20 15:59:20 -0800
commite022d5b71ffca486b5bc174702a9fe0e35038c75 (patch)
tree3917cfcbc7ae74dca2b452e93562ae94d2ee1c1e
parent883863001652627f47dc1ecc6e42294687c8785b (diff)
downloadlibvpx-e022d5b71ffca486b5bc174702a9fe0e35038c75.tar.gz
[x86]: Add vpx_highbd_comp_avg_pred_sse2().
C vs SSE2 speedups — 4x4: 3.38x, 8x8: 3.45x, 16x16: 2.06x, 32x32: 2.19x, 64x64: 1.39x. Change-Id: I46638fe187b49a78fee554114fac51c485d74474
-rw-r--r--test/comp_avg_pred_test.cc8
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl2
-rw-r--r--vpx_dsp/x86/highbd_variance_sse2.c47
3 files changed, 55 insertions, 2 deletions
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 66dc4eb4e..70aeab8d7 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -185,7 +185,7 @@ void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
- for (int i = 0; i < 10000000 / (width * height); ++i) {
+ for (int i = 0; i < 100000000 / (width * height); ++i) {
avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
(uint8_t *)pred.TopLeftPixel(), width, height,
(uint8_t *)ref.TopLeftPixel(), ref.stride());
@@ -254,5 +254,11 @@ INSTANTIATE_TEST_SUITE_P(
C, AvgPredTestHBD,
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AvgPredTestHBD,
+ ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif // HAVE_SSE2
+
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b6d656820..8725821b6 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1400,7 +1400,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
- specialize qw/vpx_highbd_comp_avg_pred neon/;
+ specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
#
# Subpixel Variance
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 7c8d79b09..381e0ad19 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
@@ -559,3 +560,49 @@ FNS(sse2)
#undef FNS
#undef FN
+
+// SSE2 rounded average of two high-bitdepth (uint16_t) prediction blocks:
+//   comp_pred[x] = (pred[x] + ref[x] + 1) >> 1
+// comp_pred and pred are packed (row stride == width, per the += width
+// advances below); ref is strided by ref_stride. Width is assumed to be 4,
+// 8, or a multiple of 16, and height even in the width<=8 paths (the loops
+// step i by 2) -- presumably guaranteed by the block-size callers; confirm.
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ if (width > 8) {
+ // Wide blocks (width a multiple of 16): one row per outer iteration,
+ // 16 pixels (two 8-lane vectors) per inner iteration.
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+ // _mm_avg_epu16 computes (a + b + 1) >> 1 per 16-bit lane, which
+ // matches the rounded average of the C reference.
+ _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ // width == 8: process two rows per iteration. pred rows are packed, so
+ // rows i and i+1 sit at &pred[0] and &pred[8]; ref rows need ref_stride.
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+ _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+ // Advance past the two rows just written (8 << 1 == 16 pixels).
+ comp_pred += 8 << 1;
+ pred += 8 << 1;
+ ref += ref_stride << 1;
+ }
+ } else {
+ // width == 4: two rows per iteration using 64-bit (4-lane) loads/stores.
+ assert(width == 4);
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+ const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+ _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+ comp_pred += 4 << 1;
+ pred += 4 << 1;
+ ref += ref_stride << 1;
+ }
+ }
+}