summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorScott LaVarnway <slavarnway@google.com>2022-12-06 13:13:30 -0800
committerScott LaVarnway <slavarnway@google.com>2022-12-08 12:04:53 -0800
commita7bb04b43598cb2dcefea2352a1fde4dbd269fe5 (patch)
tree55aa746a11b48b485bdf6a820b510e76fe47e8a0
parent1450ec46e273b32234b036d7803aaae09423dd08 (diff)
downloadlibvpx-a7bb04b43598cb2dcefea2352a1fde4dbd269fe5.tar.gz
[x86]: Add vpx_highbd_subtract_block_avx2().
Up to 4x faster than "sse2 vectorized C". Change-Id: Ie9b3c12a437c5cddf92c4d5349c4f659ca6b82ea
-rw-r--r--test/vp9_subtract_test.cc9
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl2
-rw-r--r--vpx_dsp/x86/subtract_avx2.c107
3 files changed, 117 insertions, 1 deletions
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index 7c69a317c..a57082f1e 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -308,5 +308,14 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(&vpx_highbd_subtract_block_c),
::testing::Values(&vpx_highbd_subtract_block_c)));
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, VPXHBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&vpx_highbd_subtract_block_avx2),
+ ::testing::Values(&vpx_highbd_subtract_block_c)));
+#endif // HAVE_AVX2
+
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace vp9
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 51f5ebedd..b6d656820 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -939,7 +939,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
- specialize qw/vpx_highbd_subtract_block neon/;
+ specialize qw/vpx_highbd_subtract_block neon avx2/;
#
# Single block SAD
diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c
index 4d259ef5c..4849581ed 100644
--- a/vpx_dsp/x86/subtract_avx2.c
+++ b/vpx_dsp/x86/subtract_avx2.c
@@ -94,3 +94,110 @@ void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
break;
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+ if (cols == 64) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+ const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+ const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ const __m256i d2 = _mm256_sub_epi16(s2, p2);
+ const __m256i d3 = _mm256_sub_epi16(s3, p3);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 32) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 16) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 =
+ _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 =
+ _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else if (cols == 8) {
+ int j = rows;
+ do {
+ const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storeu_si128((__m128i *)diff_ptr, d0);
+ _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else {
+ int j = rows;
+ assert(cols == 4);
+ do {
+ const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storel_epi64((__m128i *)diff_ptr, d0);
+ _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH