diff options
author | Scott LaVarnway <slavarnway@google.com> | 2022-12-20 15:43:44 -0800 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2022-12-20 15:59:20 -0800 |
commit | e022d5b71ffca486b5bc174702a9fe0e35038c75 (patch) | |
tree | 3917cfcbc7ae74dca2b452e93562ae94d2ee1c1e | |
parent | 883863001652627f47dc1ecc6e42294687c8785b (diff) | |
download | libvpx-e022d5b71ffca486b5bc174702a9fe0e35038c75.tar.gz |
[x86]: Add vpx_highbd_comp_avg_pred_sse2().
Speedup of the SSE2 version over the C reference (ratio, higher is faster):
4x4: 3.38x
8x8: 3.45x
16x16: 2.06x
32x32: 2.19x
64x64: 1.39x
Change-Id: I46638fe187b49a78fee554114fac51c485d74474
-rw-r--r-- | test/comp_avg_pred_test.cc | 8 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_variance_sse2.c | 47 |
3 files changed, 55 insertions, 2 deletions
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 66dc4eb4e..70aeab8d7 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -185,7 +185,7 @@ void AvgPredTest<bitdepth, Pixel>::TestSpeed() { vpx_usec_timer timer; vpx_usec_timer_start(&timer); - for (int i = 0; i < 10000000 / (width * height); ++i) { + for (int i = 0; i < 100000000 / (width * height); ++i) { avg_pred_func_((uint8_t *)avg.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()); @@ -254,5 +254,11 @@ INSTANTIATE_TEST_SUITE_P( C, AvgPredTestHBD, ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>)); +#endif // HAVE_SSE2 + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b6d656820..8725821b6 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1400,7 +1400,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; - specialize qw/vpx_highbd_comp_avg_pred neon/; + specialize qw/vpx_highbd_comp_avg_pred neon sse2/; # # Subpixel Variance diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index 7c8d79b09..381e0ad19 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/

/*
 * NOTE(review): this span is the tail of the patch to
 * vpx_dsp/x86/highbd_variance_sse2.c, reformatted for readability.  The
 * surrounding file also includes "./vpx_config.h" and "./vpx_dsp_rtcd.h"
 * (unchanged context lines, omitted here), and ends with the existing
 * FNS(sse2) / #undef FNS / #undef FN block that precedes the new function.
 */
#include <assert.h>     /* assert() in the width == 4 path.  The patch added
                           only <emmintrin.h>; no <assert.h> is visible in the
                           include hunk, so a debug (!NDEBUG) build could fail
                           to compile — include it explicitly. */
#include <emmintrin.h>  /* SSE2 intrinsics (_mm_avg_epu16 et al.) */
#include <stdint.h>     /* uint16_t */

/*
 * SSE2 rounded average of two high-bitdepth prediction blocks:
 *
 *   comp_pred[r][c] = (pred[r][c] + ref[r][c] + 1) >> 1
 *
 * comp_pred  - output block; rows are contiguous (stride == width)
 * pred       - first input block; rows are contiguous (stride == width)
 * width      - block width in pixels; the branch structure assumes the VP9
 *              block sizes 4, 8, 16, 32 or 64 (width > 8 implies a multiple
 *              of 16)
 * height     - block height in pixels (even for the 4- and 8-wide paths,
 *              which process two rows per iteration)
 * ref        - second input block; row stride is ref_stride
 * ref_stride - stride of ref in uint16_t units (pixels, not bytes)
 *
 * _mm_avg_epu16 computes (a + b + 1) >> 1 per unsigned 16-bit lane, which is
 * exactly the rounding used by the C reference vpx_highbd_comp_avg_pred_c().
 */
void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
                                   int width, int height, const uint16_t *ref,
                                   int ref_stride) {
  int i, j;
  if (width > 8) {
    /* 16/32/64 wide: one row per iteration, 16 pixels (two XMM regs) at a
     * time. */
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; j += 16) {
        const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
        const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
        const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
        const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
        _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
        _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else if (width == 8) {
    /* 8 wide: pred/comp_pred rows are contiguous, so two consecutive rows
     * (16 pixels) load as &pred[0] and &pred[8]; ref needs per-row
     * addressing via ref_stride. */
    for (i = 0; i < height; i += 2) {
      const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
      const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
      const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
      const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
      _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
      _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
      comp_pred += 8 << 1;  /* advance two contiguous 8-pixel rows */
      pred += 8 << 1;
      ref += ref_stride << 1;
    }
  } else {
    /* 4 wide: two rows per iteration using 64-bit (4 x uint16_t)
     * loads/stores. */
    assert(width == 4);
    for (i = 0; i < height; i += 2) {
      const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
      const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
      const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
      const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
      _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
      _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
      comp_pred += 4 << 1;  /* advance two contiguous 4-pixel rows */
      pred += 4 << 1;
      ref += ref_stride << 1;
    }
  }
}