author     Yunqing Wang <yunqingwang@google.com>                        2023-04-18 16:11:01 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>    2023-04-18 16:11:01 +0000
commit     3d7358796d60af4324d822524070a4b50e1a09a9 (patch)
tree       d1e28091c4f9f4a70f62f1a83a4f9d993a187567
parent     8f14f66490f8bfbbf01ce809629c02631baf5b8f (diff)
parent     232f8659aafec1461cac76f76885c8663755957f (diff)
download   libvpx-3d7358796d60af4324d822524070a4b50e1a09a9.tar.gz
Merge "Downsample SAD computation in motion search" into main
-rw-r--r--  test/sad_test.cc                                     515
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                            4
-rw-r--r--  vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c     6
-rw-r--r--  vp9/encoder/vp9_encoder.c                            647
-rw-r--r--  vp9/encoder/vp9_firstpass.c                            7
-rw-r--r--  vp9/encoder/vp9_mcomp.c                               97
-rw-r--r--  vp9/encoder/vp9_mcomp.h                               10
-rw-r--r--  vp9/encoder/vp9_speed_features.c                       2
-rw-r--r--  vp9/encoder/vp9_speed_features.h                       4
-rw-r--r--  vp9/encoder/x86/vp9_diamond_search_sad_avx.c           6
-rw-r--r--  vpx_dsp/sad.c                                         30
-rw-r--r--  vpx_dsp/variance.h                                     4
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl                         147
-rw-r--r--  vpx_dsp/x86/highbd_sad4d_avx2.c                      313
-rw-r--r--  vpx_dsp/x86/highbd_sad4d_sse2.asm                     43
-rw-r--r--  vpx_dsp/x86/highbd_sad_avx2.c                        188
-rw-r--r--  vpx_dsp/x86/highbd_sad_sse2.asm                       59
-rw-r--r--  vpx_dsp/x86/sad4d_avx2.c                              66
-rw-r--r--  vpx_dsp/x86/sad4d_sse2.asm                            43
-rw-r--r--  vpx_dsp/x86/sad_avx2.c                               145
-rw-r--r--  vpx_dsp/x86/sad_sse2.asm                              70
21 files changed, 1824 insertions, 582 deletions
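
The change this merge brings in is easiest to see in the test reference added below (ReferenceSADSkip): the new vpx_sad_skip_* kernels compute the SAD over every other row only and then double the accumulated value, roughly halving the per-candidate cost of full-pixel motion search. A minimal C sketch of that pattern, assuming a plain scalar loop (the library's actual kernels are the C versions in vpx_dsp/sad.c plus the SSE2/AVX2/NEON files listed above):

#include <stdint.h>
#include <stdlib.h>

/* Downsampled SAD sketch: visit only the even rows of a width x height block
 * and double the sum so the result stays on the same scale as a full SAD. */
static unsigned int sad_skip_rows(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int width, int height) {
  unsigned int sad = 0;
  for (int r = 0; r < height; r += 2) {  /* skip every other row */
    for (int c = 0; c < width; ++c) {
      sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
    }
  }
  return 2 * sad;  /* compensate for the rows that were skipped */
}
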
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 0896c77f1..561da5ddf 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -42,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride);
typedef TestParams<SadMxNFunc> SadMxNParam;
+typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride);
+typedef TestParams<SadSkipMxNFunc> SadSkipMxNParam;
+
typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const uint8_t *second_pred);
@@ -52,6 +56,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
unsigned int *sad_array);
typedef TestParams<SadMxNx4Func> SadMxNx4Param;
+typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[], int ref_stride,
+ unsigned int *sad_array);
+typedef TestParams<SadSkipMxNx4Func> SadSkipMxNx4Param;
+
typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sad_array);
@@ -170,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
return sad;
}
+  // Sum of Absolute Differences, skipping rows. Given two blocks, calculate
+  // the absolute difference between pixels at the same relative location on
+  // every other row; accumulate, then double the result at the end.
+ uint32_t ReferenceSADSkip(int ref_offset) const {
+ uint32_t sad = 0;
+ const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset);
+ const uint8_t *const source8 = source_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset));
+ const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ for (int h = 0; h < params_.height; h += 2) {
+ for (int w = 0; w < params_.width; ++w) {
+ if (!use_high_bit_depth_) {
+ sad += abs(source8[h * source_stride_ + w] -
+ reference8[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ sad += abs(source16[h * source_stride_ + w] -
+ reference16[h * reference_stride_ + w]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+ }
+ return sad * 2;
+ }
+
// Sum of Absolute Differences Average. Given two blocks, and a prediction
// calculate the absolute difference between one pixel and average of the
// corresponding and predicted pixels; accumulate.
@@ -290,6 +327,32 @@ class SADx4Test : public SADTestBase<SadMxNx4Param> {
}
};
+class SADSkipx4Test : public SADTestBase<SadMxNx4Param> {
+ public:
+ SADSkipx4Test() : SADTestBase(GetParam()) {}
+
+ protected:
+ void SADs(unsigned int *results) const {
+ const uint8_t *references[] = { GetReference(0), GetReference(1),
+ GetReference(2), GetReference(3) };
+
+ ASM_REGISTER_STATE_CHECK(params_.func(
+ source_data_, source_stride_, references, reference_stride_, results));
+ }
+
+ void CheckSADs() const {
+ uint32_t reference_sad;
+ DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
+
+ SADs(exp_sad);
+ for (int block = 0; block < 4; ++block) {
+ reference_sad = ReferenceSADSkip(GetBlockRefOffset(block));
+
+ EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+ }
+ }
+};
+
class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> {
public:
SADTest() : SADTestBase(GetParam()) {}
@@ -317,6 +380,33 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> {
}
};
+class SADSkipTest : public AbstractBench, public SADTestBase<SadMxNParam> {
+ public:
+ SADSkipTest() : SADTestBase(GetParam()) {}
+
+ protected:
+ unsigned int SAD(int block_idx) const {
+ unsigned int ret;
+ const uint8_t *const reference = GetReference(block_idx);
+
+ ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_,
+ reference, reference_stride_));
+ return ret;
+ }
+
+ void CheckSAD() const {
+ const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0));
+ const unsigned int exp_sad = SAD(0);
+
+ ASSERT_EQ(reference_sad, exp_sad);
+ }
+
+ void Run() override {
+ params_.func(source_data_, source_stride_, reference_data_,
+ reference_stride_);
+ }
+};
+
class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> {
public:
SADavgTest() : SADTestBase(GetParam()) {}
@@ -397,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) {
PrintMedian(title);
}
+TEST_P(SADSkipTest, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(reference_data_, reference_stride_, mask_);
+ CheckSAD();
+}
+
+TEST_P(SADSkipTest, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(reference_data_, reference_stride_, 0);
+ CheckSAD();
+}
+
+TEST_P(SADSkipTest, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, ShortSrc) {
+ const int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, DISABLED_Speed) {
+ const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+ FillRandom(source_data_, source_stride_);
+
+ RunNTimes(kCountSpeedTestBlock);
+
+ char title[16];
+ snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+ PrintMedian(title);
+}
+
TEST_P(SADavgTest, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(reference_data_, reference_stride_, mask_);
@@ -554,6 +696,105 @@ TEST_P(SADx4Test, DISABLED_Speed) {
reference_stride_ = tmp_stride;
}
+TEST_P(SADSkipx4Test, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(GetReference(0), reference_stride_, mask_);
+ FillConstant(GetReference(1), reference_stride_, mask_);
+ FillConstant(GetReference(2), reference_stride_, mask_);
+ FillConstant(GetReference(3), reference_stride_, mask_);
+ CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(GetReference(0), reference_stride_, 0);
+ FillConstant(GetReference(1), reference_stride_, 0);
+ FillConstant(GetReference(2), reference_stride_, 0);
+ FillConstant(GetReference(3), reference_stride_, 0);
+ CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, ShortRef) {
+ int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, ShortSrc) {
+ int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, SrcAlignedByWidth) {
+ uint8_t *tmp_source_data = source_data_;
+ source_data_ += params_.width;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ source_data_ = tmp_source_data;
+}
+
+TEST_P(SADSkipx4Test, DISABLED_Speed) {
+ int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height);
+ uint32_t reference_sad[4];
+ DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
+ vpx_usec_timer timer;
+ for (int block = 0; block < 4; ++block) {
+ reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block));
+ }
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+ SADs(exp_sad);
+ }
+ vpx_usec_timer_mark(&timer);
+ for (int block = 0; block < 4; ++block) {
+ EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block;
+ }
+ const int elapsed_time =
+ static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+ printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height,
+ bit_depth_, elapsed_time);
+
+ reference_stride_ = tmp_stride;
+}
+
//------------------------------------------------------------------------------
// C functions
const SadMxNParam c_tests[] = {
@@ -614,6 +855,56 @@ const SadMxNParam c_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+const SadSkipMxNParam skip_c_tests[] = {
+ SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c),
+ SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c),
+ SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c),
+ SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c),
+ SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c),
+ SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c),
+ SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c),
+ SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c),
+ SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c),
+ SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c),
+ SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8),
+ SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8),
+ SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8),
+ SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8),
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10),
+ SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10),
+ SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10),
+ SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10),
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12),
+ SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12),
+ SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12),
+ SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests));
+
const SadMxNAvgParam avg_c_tests[] = {
SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c),
SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c),
@@ -730,6 +1021,57 @@ const SadMxNx4Param x4d_c_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+const SadSkipMxNx4Param skip_x4d_c_tests[] = {
+ SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c),
+ SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c),
+ SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c),
+ SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c),
+ SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c),
+ SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c),
+ SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c),
+ SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c),
+ SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c),
+ SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c),
+ SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8),
+ SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8),
+ SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8),
+ SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8),
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 10),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10),
+ SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10),
+ SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10),
+ SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10),
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12),
+ SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12),
+ SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12),
+ SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test,
+ ::testing::ValuesIn(skip_x4d_c_tests));
+
//------------------------------------------------------------------------------
// ARM functions
#if HAVE_NEON
@@ -956,6 +1298,54 @@ const SadMxNParam sse2_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+const SadSkipMxNParam skip_sse2_tests[] = {
+ SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2),
+ SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2),
+ SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2),
+ SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2),
+ SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2),
+ SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2),
+ SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2),
+ SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2),
+ SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2),
+ SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2),
+ SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8),
+ SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8),
+ SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8),
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10),
+ SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10),
+ SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10),
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12),
+ SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12),
+ SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest,
+ ::testing::ValuesIn(skip_sse2_tests));
+
const SadMxNAvgParam avg_sse2_tests[] = {
SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2),
SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2),
@@ -1065,6 +1455,57 @@ const SadMxNx4Param x4d_sse2_tests[] = {
#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+
+const SadSkipMxNx4Param skip_x4d_sse2_tests[] = {
+ SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2),
+ SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2),
+ SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2),
+ SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2),
+ SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2),
+ SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2),
+ SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2),
+ SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2),
+ SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2),
+ SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2),
+ SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8),
+ SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8),
+ SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8),
+ SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8),
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10),
+ SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10),
+ SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10),
+ SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10),
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12),
+ SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12),
+ SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12),
+ SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test,
+ ::testing::ValuesIn(skip_x4d_sse2_tests));
#endif // HAVE_SSE2
#if HAVE_SSE3
@@ -1113,6 +1554,44 @@ const SadMxNParam avx2_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+const SadSkipMxNParam skip_avx2_tests[] = {
+ SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2),
+ SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2),
+ SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2),
+ SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2),
+ SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8),
+
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10),
+
+ SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12),
+ SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12),
+ SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12),
+ SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12),
+ SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12),
+ SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12),
+ SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12),
+ SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest,
+ ::testing::ValuesIn(skip_avx2_tests));
+
const SadMxNAvgParam avg_avx2_tests[] = {
SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2),
SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2),
@@ -1180,6 +1659,42 @@ const SadMxNx4Param x4d_avx2_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
+ SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2),
+ SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2),
+ SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2),
+ SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2),
+ SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8),
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10),
+ SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12),
+ SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12),
+ SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12),
+ SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12),
+ SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12),
+ SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12),
+ SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12),
+ SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,
+ ::testing::ValuesIn(skip_x4d_avx2_tests));
+
#endif // HAVE_AVX2
#if HAVE_AVX512
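
As a quick sanity check of the doubling, take the MaxRef case above with an 8-bit 64x64 block (source all 0, reference all mask_ = 255): ReferenceSADSkip sums 255 over 32 rows x 64 columns = 522240 and doubles it to 1044480, which for a constant block is exactly the full 64x64 SAD of 255 * 4096. For real content the two values differ, which is why the patch also touches vp9_speed_features.[ch] (see the diffstat) to gate where the skip kernels are used.
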
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 5e6079255..4b94c31f1 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -23,7 +23,7 @@ struct macroblockd;
/* Encoder forward decls */
struct macroblock;
-struct vp9_variance_vtable;
+struct vp9_sad_table;
struct search_site_config;
struct mv;
union int_mv;
@@ -171,7 +171,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
#
# Motion search
#
-add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv";
specialize qw/vp9_diamond_search_sad avx neon/;
#
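
The prototype change above replaces the full vp9_variance_vtable argument with a leaner vp9_sad_table, since diamond search only needs the SAD kernels. The struct itself is defined in one of the headers this patch touches (most likely vp9/encoder/vp9_mcomp.h, per the diffstat); judging from how sad_fn_ptr is used in the search code below, a plausible shape is the following sketch — the typedef and field names are inferred from the call sites, not copied from the header:

#include <stdint.h>

/* Assumed layout, reconstructed from the sad_fn_ptr->sdx4df call sites in the
 * diamond-search files below; the authoritative definition lives in the
 * encoder headers changed by this patch. */
typedef unsigned int (*sad_fn_t)(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride);
typedef void (*sad_multi_d_fn_t)(const uint8_t *src, int src_stride,
                                 const uint8_t *const ref_array[],
                                 int ref_stride, uint32_t *sad_array);

typedef struct vp9_sad_table {
  sad_fn_t sdf;            /* single-reference SAD (full or skip variant) */
  sad_multi_d_fn_t sdx4df; /* SAD against four references at once */
} vp9_sad_fn_ptr_t;
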
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
index 15334b413..255e6fbc4 100644
--- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -49,7 +49,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
const search_site_config *cfg, MV *ref_mv,
uint32_t start_mv_sad, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
static const uint32_t data[4] = { 0, 1, 2, 3 };
const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
@@ -188,8 +188,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
#endif
}
- fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
- in_what_stride, (uint32_t *)&v_sad_d);
+ sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
// Look up the component cost of the residual motion vector
{
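
The encoder change that follows wires the new kernels into the per-block-size function table: BFP and HIGHBD_BFP gain sdsf (skip SAD) and sdsx4df (skip SAD x4) slots next to the existing sdf and sdx4df. The sketch below is a hypothetical helper, not part of the patch, showing how a caller could fill the reduced SAD table from either set; the boolean parameter stands in for whatever speed-feature check the encoder actually performs (that gating lives in the mcomp and speed-feature files, which are not shown here):

/* Hypothetical helper: populate the diamond-search SAD table (sketched after
 * the rtcd change above) from the encoder's per-block-size function pointers,
 * optionally selecting the downsampled (row-skipping) kernels. */
static void pick_sad_fns(vp9_sad_fn_ptr_t *out,
                         const vp9_variance_fn_ptr_t *fn,
                         int use_downsampled_sad /* assumed gating flag */) {
  if (use_downsampled_sad) {
    out->sdf = fn->sdsf;
    out->sdx4df = fn->sdsx4df;
  } else {
    out->sdf = fn->sdf;
    out->sdx4df = fn->sdx4df;
  }
}
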
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 72a6189d1..354f08eae 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1561,13 +1561,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
}
#if CONFIG_VP9_HIGHBITDEPTH
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF;
+#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdsf = SDSF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
#define MAKE_BFP_SAD_WRAPPER(fnname) \
static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
@@ -1627,284 +1629,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
}
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d)
static void highbd_set_var_fns(VP9_COMP *const cpi) {
VP9_COMMON *const cm = &cpi->common;
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {
case VPX_BITS_8:
- HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8,
- vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16,
- vpx_highbd_8_sub_pixel_variance32x16,
- vpx_highbd_8_sub_pixel_avg_variance32x16,
- vpx_highbd_sad32x16x4d_bits8)
-
- HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8,
- vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32,
- vpx_highbd_8_sub_pixel_variance16x32,
- vpx_highbd_8_sub_pixel_avg_variance16x32,
- vpx_highbd_sad16x32x4d_bits8)
-
- HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8,
- vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32,
- vpx_highbd_8_sub_pixel_variance64x32,
- vpx_highbd_8_sub_pixel_avg_variance64x32,
- vpx_highbd_sad64x32x4d_bits8)
-
- HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8,
- vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64,
- vpx_highbd_8_sub_pixel_variance32x64,
- vpx_highbd_8_sub_pixel_avg_variance32x64,
- vpx_highbd_sad32x64x4d_bits8)
-
- HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8,
- vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32,
- vpx_highbd_8_sub_pixel_variance32x32,
- vpx_highbd_8_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x4d_bits8)
-
- HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8,
- vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64,
- vpx_highbd_8_sub_pixel_variance64x64,
- vpx_highbd_8_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x4d_bits8)
-
- HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8,
- vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16,
- vpx_highbd_8_sub_pixel_variance16x16,
- vpx_highbd_8_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x4d_bits8)
-
- HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8,
- vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8,
- vpx_highbd_8_sub_pixel_variance16x8,
- vpx_highbd_8_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x4d_bits8)
-
- HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8,
- vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16,
- vpx_highbd_8_sub_pixel_variance8x16,
- vpx_highbd_8_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x4d_bits8)
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits8,
+ vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8,
+ vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16,
+ vpx_highbd_8_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits8,
+ vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8,
+ vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32,
+ vpx_highbd_8_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits8,
+ vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8,
+ vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32,
+ vpx_highbd_8_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits8,
+ vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8,
+ vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64,
+ vpx_highbd_8_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits8,
+ vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8,
+ vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32,
+ vpx_highbd_8_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8)
HIGHBD_BFP(
- BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
- vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
- vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8)
+ BLOCK_64X64, vpx_highbd_sad64x64_bits8,
+ vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8,
+ vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64,
+ vpx_highbd_8_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8)
HIGHBD_BFP(
- BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
- vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
- vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8)
+ BLOCK_16X16, vpx_highbd_sad16x16_bits8,
+ vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8,
+ vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16,
+ vpx_highbd_8_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8)
HIGHBD_BFP(
- BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
- vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
- vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8)
+ BLOCK_16X8, vpx_highbd_sad16x8_bits8,
+ vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
+ vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8,
+ vpx_highbd_8_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8)
HIGHBD_BFP(
- BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
- vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
- vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8)
+ BLOCK_8X16, vpx_highbd_sad8x16_bits8,
+ vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
+ vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16,
+ vpx_highbd_8_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8,
+ vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
+ vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
+ vpx_highbd_8_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8,
+ vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
+ vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
+ vpx_highbd_8_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8,
+ vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
+ vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
+ vpx_highbd_8_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8,
+ vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
+ vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
+ vpx_highbd_8_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8)
break;
case VPX_BITS_10:
- HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10,
- vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16,
- vpx_highbd_10_sub_pixel_variance32x16,
- vpx_highbd_10_sub_pixel_avg_variance32x16,
- vpx_highbd_sad32x16x4d_bits10)
-
- HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10,
- vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32,
- vpx_highbd_10_sub_pixel_variance16x32,
- vpx_highbd_10_sub_pixel_avg_variance16x32,
- vpx_highbd_sad16x32x4d_bits10)
-
- HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10,
- vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32,
- vpx_highbd_10_sub_pixel_variance64x32,
- vpx_highbd_10_sub_pixel_avg_variance64x32,
- vpx_highbd_sad64x32x4d_bits10)
-
- HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10,
- vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64,
- vpx_highbd_10_sub_pixel_variance32x64,
- vpx_highbd_10_sub_pixel_avg_variance32x64,
- vpx_highbd_sad32x64x4d_bits10)
-
- HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10,
- vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32,
- vpx_highbd_10_sub_pixel_variance32x32,
- vpx_highbd_10_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x4d_bits10)
-
- HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10,
- vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64,
- vpx_highbd_10_sub_pixel_variance64x64,
- vpx_highbd_10_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x4d_bits10)
-
- HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10,
- vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16,
- vpx_highbd_10_sub_pixel_variance16x16,
- vpx_highbd_10_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x4d_bits10)
-
- HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10,
- vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8,
- vpx_highbd_10_sub_pixel_variance16x8,
- vpx_highbd_10_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x4d_bits10)
-
- HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10,
- vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16,
- vpx_highbd_10_sub_pixel_variance8x16,
- vpx_highbd_10_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x4d_bits10)
-
- HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10,
- vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
- vpx_highbd_10_sub_pixel_variance8x8,
- vpx_highbd_10_sub_pixel_avg_variance8x8,
- vpx_highbd_sad8x8x4d_bits10)
-
- HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10,
- vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
- vpx_highbd_10_sub_pixel_variance8x4,
- vpx_highbd_10_sub_pixel_avg_variance8x4,
- vpx_highbd_sad8x4x4d_bits10)
-
- HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10,
- vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
- vpx_highbd_10_sub_pixel_variance4x8,
- vpx_highbd_10_sub_pixel_avg_variance4x8,
- vpx_highbd_sad4x8x4d_bits10)
-
- HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10,
- vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
- vpx_highbd_10_sub_pixel_variance4x4,
- vpx_highbd_10_sub_pixel_avg_variance4x4,
- vpx_highbd_sad4x4x4d_bits10)
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits10,
+ vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10,
+ vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16,
+ vpx_highbd_10_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits10,
+ vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10,
+ vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32,
+ vpx_highbd_10_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits10,
+ vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10,
+ vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32,
+ vpx_highbd_10_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits10,
+ vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10,
+ vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64,
+ vpx_highbd_10_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits10,
+ vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10,
+ vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32,
+ vpx_highbd_10_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_64X64, vpx_highbd_sad64x64_bits10,
+ vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10,
+ vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64,
+ vpx_highbd_10_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X16, vpx_highbd_sad16x16_bits10,
+ vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10,
+ vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16,
+ vpx_highbd_10_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, vpx_highbd_sad16x8_bits10,
+ vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10,
+ vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8,
+ vpx_highbd_10_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits10,
+ vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10,
+ vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16,
+ vpx_highbd_10_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10,
+ vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
+ vpx_highbd_10_sub_pixel_variance8x8,
+ vpx_highbd_10_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10,
+ vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
+ vpx_highbd_10_sub_pixel_variance8x4,
+ vpx_highbd_10_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10,
+ vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
+ vpx_highbd_10_sub_pixel_variance4x8,
+ vpx_highbd_10_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10,
+ vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
+ vpx_highbd_10_sub_pixel_variance4x4,
+ vpx_highbd_10_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10)
break;
default:
assert(cm->bit_depth == VPX_BITS_12);
- HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12,
- vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16,
- vpx_highbd_12_sub_pixel_variance32x16,
- vpx_highbd_12_sub_pixel_avg_variance32x16,
- vpx_highbd_sad32x16x4d_bits12)
-
- HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12,
- vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32,
- vpx_highbd_12_sub_pixel_variance16x32,
- vpx_highbd_12_sub_pixel_avg_variance16x32,
- vpx_highbd_sad16x32x4d_bits12)
-
- HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12,
- vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32,
- vpx_highbd_12_sub_pixel_variance64x32,
- vpx_highbd_12_sub_pixel_avg_variance64x32,
- vpx_highbd_sad64x32x4d_bits12)
-
- HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12,
- vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64,
- vpx_highbd_12_sub_pixel_variance32x64,
- vpx_highbd_12_sub_pixel_avg_variance32x64,
- vpx_highbd_sad32x64x4d_bits12)
-
- HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12,
- vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32,
- vpx_highbd_12_sub_pixel_variance32x32,
- vpx_highbd_12_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x4d_bits12)
-
- HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12,
- vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64,
- vpx_highbd_12_sub_pixel_variance64x64,
- vpx_highbd_12_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x4d_bits12)
-
- HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12,
- vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16,
- vpx_highbd_12_sub_pixel_variance16x16,
- vpx_highbd_12_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x4d_bits12)
-
- HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12,
- vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8,
- vpx_highbd_12_sub_pixel_variance16x8,
- vpx_highbd_12_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x4d_bits12)
-
- HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12,
- vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16,
- vpx_highbd_12_sub_pixel_variance8x16,
- vpx_highbd_12_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x4d_bits12)
-
- HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12,
- vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
- vpx_highbd_12_sub_pixel_variance8x8,
- vpx_highbd_12_sub_pixel_avg_variance8x8,
- vpx_highbd_sad8x8x4d_bits12)
-
- HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12,
- vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
- vpx_highbd_12_sub_pixel_variance8x4,
- vpx_highbd_12_sub_pixel_avg_variance8x4,
- vpx_highbd_sad8x4x4d_bits12)
-
- HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12,
- vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
- vpx_highbd_12_sub_pixel_variance4x8,
- vpx_highbd_12_sub_pixel_avg_variance4x8,
- vpx_highbd_sad4x8x4d_bits12)
-
- HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12,
- vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
- vpx_highbd_12_sub_pixel_variance4x4,
- vpx_highbd_12_sub_pixel_avg_variance4x4,
- vpx_highbd_sad4x4x4d_bits12)
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits12,
+ vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12,
+ vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16,
+ vpx_highbd_12_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits12,
+ vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12,
+ vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32,
+ vpx_highbd_12_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits12,
+ vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12,
+ vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32,
+ vpx_highbd_12_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits12,
+ vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12,
+ vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64,
+ vpx_highbd_12_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits12,
+ vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12,
+ vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32,
+ vpx_highbd_12_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_64X64, vpx_highbd_sad64x64_bits12,
+ vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12,
+ vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64,
+ vpx_highbd_12_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X16, vpx_highbd_sad16x16_bits12,
+ vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12,
+ vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16,
+ vpx_highbd_12_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, vpx_highbd_sad16x8_bits12,
+ vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12,
+ vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8,
+ vpx_highbd_12_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits12,
+ vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12,
+ vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16,
+ vpx_highbd_12_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12,
+ vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
+ vpx_highbd_12_sub_pixel_variance8x8,
+ vpx_highbd_12_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12,
+ vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
+ vpx_highbd_12_sub_pixel_variance8x4,
+ vpx_highbd_12_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12,
+ vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
+ vpx_highbd_12_sub_pixel_variance4x8,
+ vpx_highbd_12_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12,
+ vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
+ vpx_highbd_12_sub_pixel_variance4x4,
+ vpx_highbd_12_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12)
break;
}
}
@@ -2550,61 +2629,67 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF;
-
- BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
- vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
- vpx_sad32x16x4d)
-
- BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
- vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
- vpx_sad16x32x4d)
-
- BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
- vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
- vpx_sad64x32x4d)
-
- BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
- vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
- vpx_sad32x64x4d)
-
- BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
- vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
- vpx_sad32x32x4d)
-
- BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
- vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
- vpx_sad64x64x4d)
-
- BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
- vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
- vpx_sad16x16x4d)
-
- BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
- vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
- vpx_sad16x8x4d)
-
- BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
- vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
- vpx_sad8x16x4d)
-
- BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
- vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
-
- BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
- vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
-
- BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
- vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
-
- BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
- vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
+#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdsf = SDSF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+ BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg,
+ vpx_variance32x16, vpx_sub_pixel_variance32x16,
+ vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d)
+
+ BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg,
+ vpx_variance16x32, vpx_sub_pixel_variance16x32,
+ vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d)
+
+ BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg,
+ vpx_variance64x32, vpx_sub_pixel_variance64x32,
+ vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d)
+
+ BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg,
+ vpx_variance32x64, vpx_sub_pixel_variance32x64,
+ vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d)
+
+ BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg,
+ vpx_variance32x32, vpx_sub_pixel_variance32x32,
+ vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d)
+
+ BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg,
+ vpx_variance64x64, vpx_sub_pixel_variance64x64,
+ vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d)
+
+ BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg,
+ vpx_variance16x16, vpx_sub_pixel_variance16x16,
+ vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d)
+
+ BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg,
+ vpx_variance16x8, vpx_sub_pixel_variance16x8,
+ vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d)
+
+ BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg,
+ vpx_variance8x16, vpx_sub_pixel_variance8x16,
+ vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d)
+
+ BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8,
+ vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+ vpx_sad_skip_8x8x4d)
+
+ BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4,
+ vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+ vpx_sad_skip_8x4x4d)
+
+ BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8,
+ vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+ vpx_sad_skip_4x8x4d)
+
+ BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4,
+ vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+ vpx_sad_skip_4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
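Note on the table above: the two new slots being wired up, sdsf and sdsx4df, hold the "skip" SAD kernels added elsewhere in this patch. They visit only every other row of the block and double the partial sum so the result stays on the same scale as a full SAD. A minimal C sketch of that contract follows (the function name and include are illustrative only; the actual reference implementations are the vpx_sad_skip_*_c functions in vpx_dsp/sad.c further down in this patch):

#include <stdlib.h> /* abs() */

/* Sketch of the behaviour expected from an sdsf kernel: accumulate absolute
 * differences over the even rows only, then double the total so it
 * approximates the SAD of the whole width x height block. */
static unsigned int sketch_sad_skip_rows(const unsigned char *src,
                                         int src_stride,
                                         const unsigned char *ref,
                                         int ref_stride, int width,
                                         int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; r += 2) {
    for (c = 0; c < width; ++c) {
      sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
    }
  }
  return 2 * sad;
}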
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 0efa836ac..71d8775ea 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -437,6 +437,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
MV center_mv_full = ref_mv_full;
unsigned int start_mv_sad;
+ vp9_sad_fn_ptr_t sad_fn_ptr;
int step_param = 3;
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
@@ -462,11 +463,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
x->mv_limits.row_min, x->mv_limits.row_max);
start_mv_sad = get_start_mv_sad(x, &ref_mv_full, &center_mv_full,
cpi->fn_ptr[bsize].sdf, x->sadperbit16);
+ sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf;
+ sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df;
// Center the initial step/diamond search on best mv.
tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad,
&tmp_mv, step_param, x->sadperbit16, &num00,
- &v_fn_ptr, ref_mv);
+ &sad_fn_ptr, ref_mv);
if (tmp_err < INT_MAX)
tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
@@ -488,7 +491,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
} else {
tmp_err = cpi->diamond_search_sad(
x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n,
- x->sadperbit16, &num00, &v_fn_ptr, ref_mv);
+ x->sadperbit16, &num00, &sad_fn_ptr, ref_mv);
if (tmp_err < INT_MAX)
tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
if (tmp_err < INT_MAX - new_mv_mode_penalty)
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4ff685b24..64e9ef0f9 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2055,7 +2055,7 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row,
int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
MV *ref_mv, uint32_t start_mv_sad, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
int i, j, step;
@@ -2117,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address;
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
- sad_array);
+ sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
for (t = 0; t < 4; t++, i++) {
if (sad_array[t] < bestsad) {
@@ -2142,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
if (is_mv_in(&x->mv_limits, &this_mv)) {
const uint8_t *const check_here = ss_os[i] + best_address;
unsigned int thissad =
- fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+ sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad) {
thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
@@ -2484,24 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x,
point as the best match, we will do a final 1-away diamond
refining search */
static int full_pixel_diamond(const VP9_COMP *const cpi,
- const MACROBLOCK *const x, MV *mvp_full,
- int step_param, int sadpb, int further_steps,
- int do_refine, int *cost_list,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ int use_downsampled_sad, int *cost_list,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv) {
MV temp_mv;
int thissme, n, num00 = 0;
int bestsme;
- unsigned int start_mv_sad;
+ const int src_buf_stride = x->plane[0].src.stride;
+ const uint8_t *const src_buf = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int pred_buf_stride = xd->plane[0].pre[0].stride;
+ uint8_t *pred_buf;
+ vp9_sad_fn_ptr_t sad_fn_ptr;
+ unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows;
const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max,
x->mv_limits.row_min, x->mv_limits.row_max);
- start_mv_sad =
- get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb);
+
+ pred_buf =
+ xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+ start_mv_sad_even_rows =
+ fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+ start_mv_sad_odd_rows =
+ fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride,
+ pred_buf + pred_buf_stride, pred_buf_stride);
+ start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1;
+ start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb);
+
+ sad_fn_ptr.sdf = fn_ptr->sdf;
+ sad_fn_ptr.sdx4df = fn_ptr->sdx4df;
+ if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) {
+ // If the absolute difference between the pred-to-src SAD of even rows and
+ // the pred-to-src SAD of odd rows is small, skip every other row in sad
+ // computation.
+ const int odd_to_even_diff_sad =
+ abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+ const int mult_thresh = 10;
+ if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+ sad_fn_ptr.sdf = fn_ptr->sdsf;
+ sad_fn_ptr.sdx4df = fn_ptr->sdsx4df;
+ }
+ }
bestsme =
cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv,
- step_param, sadpb, &n, fn_ptr, ref_mv);
+ step_param, sadpb, &n, &sad_fn_ptr, ref_mv);
if (bestsme < INT_MAX)
bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
*dst_mv = temp_mv;
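Note on the gate above: the SAD at the starting MV is measured separately on even and odd rows, and downsampling is only enabled when the two halves agree to within roughly 10% of the even-row total. A hedged standalone sketch of that rule, with illustrative numbers in the comments (the helper name is hypothetical):

#include <stdlib.h> /* abs() */

/* Returns nonzero when skipping every other row is unlikely to change the
 * search outcome, mirroring the mult_thresh test above. */
static int sketch_allow_row_skipping(unsigned int sad_even_rows,
                                     unsigned int sad_odd_rows) {
  const int mult_thresh = 10;
  const int diff = abs((int)sad_even_rows - (int)sad_odd_rows);
  /* Example: even = 1000, odd = 1050 -> 50 * 10 = 500  < 1000: skip rows.
   *          even = 1000, odd = 1300 -> 300 * 10 = 3000 >= 1000: full SAD. */
  return diff * mult_thresh < (int)sad_even_rows;
}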
@@ -2518,7 +2548,7 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
} else {
thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad,
&temp_mv, step_param + n, sadpb, &num00,
- fn_ptr, ref_mv);
+ &sad_fn_ptr, ref_mv);
if (thissme < INT_MAX)
thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
@@ -2536,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
if (do_refine) {
const int search_range = 8;
MV best_mv = *dst_mv;
- thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
- ref_mv);
+ thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
+ &sad_fn_ptr, ref_mv);
if (thissme < INT_MAX)
thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
if (thissme < bestsme) {
@@ -2546,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
}
}
+ if (sad_fn_ptr.sdf != fn_ptr->sdf) {
+    // If rows are being skipped during the motion search, check the quality
+    // of the skipped result. If it is poor, rerun the search with row
+    // skipping disabled.
+ const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv);
+ const int sad =
+ fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+ const int skip_sad =
+ fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+ // We will keep the result of skipping rows if it's good enough.
+ const int kSADThresh =
+ 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) {
+ // There is a large discrepancy between skipping and not skipping, so we
+ // need to redo the motion search.
+ return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb,
+ further_steps, do_refine, 0, cost_list, fn_ptr,
+ ref_mv, dst_mv);
+ }
+ }
+
// Return cost list.
if (cost_list) {
calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
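Note on the safety net above: once the search settles on dst_mv, the exact SAD and the skip-row SAD are recomputed at that position, and the search is redone with full SAD whenever the block is not an almost-perfect match (sad above kSADThresh) and the two measurements disagree by 90% or more of the exact SAD. A hedged standalone sketch of the trigger condition, with illustrative values in the comments (the helper name is hypothetical):

#include <stdlib.h> /* abs() */

/* Returns nonzero when the downsampled search result should be discarded and
 * the search repeated with row skipping disabled. */
static int sketch_redo_full_search(int sad, int skip_sad, int sad_thresh) {
  const int denom = sad > 1 ? sad : 1;
  /* Example: sad = 1000, skip_sad = 1150 -> 150 * 10 = 1500   < 9000: keep.
   *          sad = 1000, skip_sad = 2100 -> 1100 * 10 = 11000 >= 9000: redo. */
  return sad > sad_thresh && abs(skip_sad - sad) * 10 >= denom * 9;
}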
@@ -2697,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
int search_range,
- const vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
@@ -2706,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
unsigned int best_sad =
- fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+ sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
int i, j;
@@ -2723,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
best_address - 1, best_address + 1,
best_address + in_what->stride };
- fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+ sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride,
+ sads);
for (j = 0; j < 4; ++j) {
if (sads[j] < best_sad) {
@@ -2743,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
if (is_mv_in(&x->mv_limits, &mv)) {
unsigned int sad =
- fn_ptr->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &mv), in_what->stride);
+ sad_fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
if (sad < best_sad) {
@@ -2861,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x,
break;
case NSTEP:
case MESH:
- var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
- MAX_MVSEARCH_STEPS - 1 - step_param, 1,
- cost_list, fn_ptr, ref_mv, tmp_mv);
+ var = full_pixel_diamond(
+ cpi, x, bsize, mvp_full, step_param, error_per_bit,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+ cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv);
break;
default: assert(0 && "Unknown search method");
}
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 62a7a047d..fd6a8b9ac 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -41,6 +41,11 @@ typedef struct search_site_config {
int total_steps;
} search_site_config;
+typedef struct vp9_sad_table {
+ vpx_sad_fn_t sdf;
+ vpx_sad_multi_d_fn_t sdx4df;
+} vp9_sad_fn_ptr_t;
+
static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
const MV *mv) {
return &buf->buf[mv->row * buf->stride + mv->col];
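Note on the new struct above: vp9_sad_fn_ptr_t deliberately carries only the two kernels the diamond and refining searches call, so a caller can swap in the skip-row variants without disturbing the rest of the per-block-size variance table. A hedged sketch of how the encoder-side code in this patch populates it (the helper name is hypothetical; full_pixel_diamond and first_pass_motion_search do this inline):

/* Assumes vpx_dsp/variance.h (vp9_variance_fn_ptr_t) and this header are
 * included. Start from the exact kernels and, when downsampling has been
 * judged safe, substitute the skip-row ones. */
static void sketch_fill_sad_table(const vp9_variance_fn_ptr_t *fn_ptr,
                                  int use_skip_rows,
                                  vp9_sad_fn_ptr_t *sad_fn_ptr) {
  sad_fn_ptr->sdf = fn_ptr->sdf;
  sad_fn_ptr->sdx4df = fn_ptr->sdx4df;
  if (use_skip_rows) {
    sad_fn_ptr->sdf = fn_ptr->sdsf;
    sad_fn_ptr->sdx4df = fn_ptr->sdsx4df;
  }
}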
@@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
struct VP9_COMP;
struct SPEED_FEATURES;
+struct vp9_sad_table;
int vp9_init_search_range(int size);
int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
int error_per_bit, int search_range,
- const struct vp9_variance_vtable *fn_ptr,
+ const struct vp9_sad_table *sad_fn_ptr,
const struct mv *center_mv);
// Perform integral projection based motion estimation.
@@ -96,7 +102,7 @@ extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv;
typedef int (*vp9_diamond_search_fn_t)(
const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv,
uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit,
- int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+ int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv);
int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
int search_range,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 034673b49..04804da1c 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -231,6 +231,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->allow_skip_recode = 1;
sf->less_rectangular_check = 1;
sf->mv.auto_mv_step_size = 1;
+ sf->mv.use_downsampled_sad = 1;
sf->prune_ref_frame_for_rect_partitions = 1;
sf->temporal_filter_search_method = NSTEP;
sf->tx_size_search_breakout = 1;
@@ -926,6 +927,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
sf->coeff_prob_appx_step = 1;
sf->mv.auto_mv_step_size = 0;
sf->mv.fullpel_search_step_param = 6;
+ sf->mv.use_downsampled_sad = 0;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index e267e55c4..7cb3f3527 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES {
// This variable sets the step_param used in full pel motion search.
int fullpel_search_step_param;
+
+ // Whether to downsample the rows in sad calculation during motion search.
+ // This is only active when there are at least 8 rows.
+ int use_downsampled_sad;
} MV_SPEED_FEATURES;
typedef struct PARTITION_SEARCH_BREAKOUT_THR {
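Note on the new flag above: a hedged sketch of how it is consumed (the helper name is hypothetical). The skip-row kernels are only considered for blocks with at least two rows of 4x4 units, which is where the "at least 8 rows" wording in the comment comes from; this mirrors the gate in full_pixel_diamond earlier in this patch.

/* Assumes vp9_common_data.h (num_4x4_blocks_high_lookup) and this header are
 * included. */
static int sketch_may_use_downsampled_sad(const SPEED_FEATURES *sf,
                                          BLOCK_SIZE bsize) {
  return sf->mv.use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2;
}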
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
index 719ab40f9..80442e359 100644
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -51,7 +51,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
const search_site_config *cfg, MV *ref_mv,
uint32_t start_mv_sad, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int);
@@ -167,8 +167,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
#endif
}
- fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
- in_what_stride, (uint32_t *)&v_sad_d);
+ sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
// Look up the component cost of the residual motion vector
{
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index b47c43430..619d7aa95 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -43,6 +43,12 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \
vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
return sad(src_ptr, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int vpx_sad_skip_##m##x##n##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \
+ (n / 2)); \
}
// Compare |src_ptr| to 4 distinct references in |ref_array[4]|
@@ -54,6 +60,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
for (i = 0; i < 4; ++i) \
sad_array[i] = \
vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \
+ } \
+ void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \
+ 2 * ref_stride, (m), (n / 2)); \
+ } \
}
/* clang-format off */
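Note on the reference implementation above, read with concrete numbers: for a 16x16 block the macro passes doubled strides and n / 2 = 8 rows, so sad() walks rows 0, 2, ..., 14 of both buffers and the half-height total is doubled at the end. A small hedged usage sketch (buffer contents are hypothetical; the declaration normally comes from the generated vpx_dsp_rtcd.h):

#include <string.h>
#include "./vpx_dsp_rtcd.h" /* declares vpx_sad_skip_16x16_c */

static void sketch_skip_sad_demo(void) {
  unsigned char src[16 * 16], ref[16 * 16];
  unsigned int est;
  memset(ref, 128, sizeof(ref));
  memset(src, 129, sizeof(src)); /* every pixel differs by 1 */
  /* Exact SAD = 16 * 16 * 1 = 256. The skip variant sums 8 sampled rows,
   * 16 * 8 * 1 = 128, and doubles it, also returning 256. */
  est = vpx_sad_skip_16x16_c(src, 16, ref, 16);
  (void)est;
}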
@@ -156,6 +171,12 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \
n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \
return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
}
#define highbd_sadMxNx4D(m, n) \
@@ -167,6 +188,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \
ref_array[i], ref_stride); \
} \
+ } \
+ void vpx_highbd_sad_skip_##m##x##n##x4d_c( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \
+ src, src_stride, ref_array[i], ref_stride); \
+ } \
}
/* clang-format off */
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 755cb907d..ccdb2f90b 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -69,11 +69,15 @@ typedef struct variance_vtable {
#if CONFIG_VP9
typedef struct vp9_variance_vtable {
vpx_sad_fn_t sdf;
+ // Same as normal sad, but downsample the rows by a factor of 2.
+ vpx_sad_fn_t sdsf;
vpx_sad_avg_fn_t sdaf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
vpx_subp_avg_variance_fn_t svaf;
vpx_sad_multi_d_fn_t sdx4df;
+ // Same as sadx4, but downsample the rows by a factor of 2.
+ vpx_sad_multi_d_fn_t sdsx4df;
} vp9_variance_fn_ptr_t;
#endif // CONFIG_VP9
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 97682a425..e3d48f493 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -786,6 +786,43 @@ specialize qw/vpx_sad4x8 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad4x4 neon msa sse2 mmi/;
+add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x64 avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x32 avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x64 avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x32 avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x16 avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x32 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x16 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x8 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x16 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x8 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+
+add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_4x8 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+
#
# Avg
#
@@ -928,6 +965,43 @@ specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
+add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x64x4d avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x32x4d avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x64x4d avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x32x4d avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x16x4d avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x32x4d sse2/;
+
+add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x16x4d sse2/;
+
+add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x8x4d sse2/;
+
+add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x16x4d sse2/;
+
+add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x8x4d sse2/;
+
+add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+
+add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_4x8x4d sse2/;
+
+add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
@@ -991,6 +1065,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad4x4 neon/;
+ add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_64x64 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_64x32 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_32x64 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_32x32 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_32x16 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_16x32 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_16x16 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_16x8 sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_8x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_8x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+
#
# Avg
#
@@ -1084,6 +1194,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad4x4x4d sse2 neon/;
+ add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_64x64x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_64x32x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_32x64x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_32x32x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_32x16x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_16x32x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_16x16x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_16x8x4d sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_8x16x4d sse2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_8x8x4d sse2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+
+ add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_4x8x4d sse2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+
#
# Structured Similarity (SSIM)
#
diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c
index 947b5e977..e483fdce7 100644
--- a/vpx_dsp/x86/highbd_sad4d_avx2.c
+++ b/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
}
}
+static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < (n / 2); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);
+
+ /* sums_16 will outrange after 2 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 1;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
#define HIGHBD_SAD64XNX4D(n) \
- void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[4], \
int ref_stride, uint32_t sad_array[4]) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *refs[4]; \
- __m256i sums_16[4]; \
- __m256i sums_32[4]; \
- int i; \
- \
- refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
- refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
- refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
- refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
- sums_32[0] = _mm256_setzero_si256(); \
- sums_32[1] = _mm256_setzero_si256(); \
- sums_32[2] = _mm256_setzero_si256(); \
- sums_32[3] = _mm256_setzero_si256(); \
- \
- for (i = 0; i < (n / 2); ++i) { \
- sums_16[0] = _mm256_setzero_si256(); \
- sums_16[1] = _mm256_setzero_si256(); \
- sums_16[2] = _mm256_setzero_si256(); \
- sums_16[3] = _mm256_setzero_si256(); \
- \
- highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \
- \
- /* sums_16 will outrange after 2 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32[0] = _mm256_add_epi32( \
- sums_32[0], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[0], 1)))); \
- sums_32[1] = _mm256_add_epi32( \
- sums_32[1], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[1], 1)))); \
- sums_32[2] = _mm256_add_epi32( \
- sums_32[2], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[2], 1)))); \
- sums_32[3] = _mm256_add_epi32( \
- sums_32[3], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[3], 1)))); \
- \
- src += src_stride << 1; \
- } \
- calc_final_4(sums_32, sad_array); \
+ highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
}
-// 64x64
-HIGHBD_SAD64XNX4D(64)
-
-// 64x32
-HIGHBD_SAD64XNX4D(32)
+#define HIGHBD_SADSKIP64XNx4D(n) \
+ void vpx_highbd_sad_skip_64x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
const uint16_t *src,
@@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
}
}
+static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < (n / 8); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ /* sums_16 will outrange after 8 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 3;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
#define HIGHBD_SAD32XNX4D(n) \
- void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[4], \
int ref_stride, uint32_t sad_array[4]) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *refs[4]; \
- __m256i sums_16[4]; \
- __m256i sums_32[4]; \
- int i; \
- \
- refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
- refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
- refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
- refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
- sums_32[0] = _mm256_setzero_si256(); \
- sums_32[1] = _mm256_setzero_si256(); \
- sums_32[2] = _mm256_setzero_si256(); \
- sums_32[3] = _mm256_setzero_si256(); \
- \
- for (i = 0; i < (n / 8); ++i) { \
- sums_16[0] = _mm256_setzero_si256(); \
- sums_16[1] = _mm256_setzero_si256(); \
- sums_16[2] = _mm256_setzero_si256(); \
- sums_16[3] = _mm256_setzero_si256(); \
- \
- highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \
- \
- /* sums_16 will outrange after 8 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32[0] = _mm256_add_epi32( \
- sums_32[0], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[0], 1)))); \
- sums_32[1] = _mm256_add_epi32( \
- sums_32[1], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[1], 1)))); \
- sums_32[2] = _mm256_add_epi32( \
- sums_32[2], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[2], 1)))); \
- sums_32[3] = _mm256_add_epi32( \
- sums_32[3], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[3], 1)))); \
- \
- src += src_stride << 3; \
- } \
- calc_final_4(sums_32, sad_array); \
+ highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
}
-// 32x64
-HIGHBD_SAD32XNX4D(64)
-
-// 32x32
-HIGHBD_SAD32XNX4D(32)
-
-// 32x16
-HIGHBD_SAD32XNX4D(16)
+#define HIGHBD_SADSKIP32XNx4D(n) \
+ void vpx_highbd_sad_skip_32x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
const uint16_t *src,
@@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
}
}
-void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[4],
- int ref_stride, uint32_t sad_array[4]) {
+static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
uint16_t *refs[4];
__m256i sums_16[4];
__m256i sums_32[4];
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
int i;
refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
@@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
sums_32[2] = _mm256_setzero_si256();
sums_32[3] = _mm256_setzero_si256();
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < num_iters; ++i) {
sums_16[0] = _mm256_setzero_si256();
sums_16[1] = _mm256_setzero_si256();
sums_16[2] = _mm256_setzero_si256();
sums_16[3] = _mm256_setzero_si256();
- highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);
// sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
sums_32[0] = _mm256_add_epi32(
@@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums_32, sad_array);
}
+#define HIGHBD_SAD16XNX4D(n) \
+ void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
+#define HIGHBD_SADSKIP16XNx4D(n) \
+ void vpx_highbd_sad_skip_16x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4],
int ref_stride, uint32_t sad_array[4]) {
@@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums_32, sad_array);
}
}
+
+// clang-format off
+HIGHBD_SAD64XNX4D(64)
+HIGHBD_SADSKIP64XNx4D(64)
+
+HIGHBD_SAD64XNX4D(32)
+HIGHBD_SADSKIP64XNx4D(32)
+
+HIGHBD_SAD32XNX4D(64)
+HIGHBD_SADSKIP32XNx4D(64)
+
+HIGHBD_SAD32XNX4D(32)
+HIGHBD_SADSKIP32XNx4D(32)
+
+HIGHBD_SAD32XNX4D(16)
+HIGHBD_SADSKIP32XNx4D(16)
+
+HIGHBD_SAD16XNX4D(32)
+HIGHBD_SADSKIP16XNx4D(32)
+
+HIGHBD_SADSKIP16XNx4D(16)
+
+HIGHBD_SADSKIP16XNx4D(8)
+// clang-format on
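Note on the HIGHBD_SADSKIP*x4D wrappers above: they all follow one pattern, calling the shared W x N x4d helper on a half-height, double-stride view of the block and then doubling each of the four results. A generic C sketch of that shape (illustrative only; the helper pointer type matches the highbd_sad*xNx4d_avx2 signatures in this file):

#include <stdint.h>

/* Generic form of the skip wrappers: reuse the exact helper on every other
 * row, then scale the four SADs back to full-block range. */
static void sketch_skip_x4d(const uint8_t *src, int src_stride,
                            const uint8_t *const ref[4], int ref_stride,
                            uint32_t sad[4], int n,
                            void (*sad_wxnx4d)(const uint8_t *, int,
                                               const uint8_t *const[4], int,
                                               uint32_t[4], int)) {
  int k;
  sad_wxnx4d(src, 2 * src_stride, ref, 2 * ref_stride, sad, n / 2);
  for (k = 0; k < 4; ++k) sad[k] <<= 1;
}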
diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm
index 6c2a61e01..a07892d81 100644
--- a/vpx_dsp/x86/highbd_sad4d_sse2.asm
+++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -213,7 +213,12 @@ SECTION .text
; uint8_t *ref[4], int ref_stride,
; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0 ; normal sad
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
res, ref2, ref3, ref4
@@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
ref2, ref3, ref4
%endif
+%else ; %3 == 2, downsample
+%if UNIX64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ;
+%endif ; sad/avg/skip
; set m1
push srcq
@@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
pshufd m1, m1, 0x0
pop srcq
+%if %3 == 2 ; skip rows
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif ; skip rows
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
mov ref2q, [ref1q+gprsize*1]
@@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
shl ref1q, 1
HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 2 ; Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
+%undef num_rep
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
; N.B. HIGH_PROCESS outputs dwords (32 bits)
; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
@@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
paddd m4, m0
paddd m6, m1
punpcklqdq m4, m6
+%if %3 == 2 ; skip rows
+ pslld m4, 1
+%endif
movifnidn r4, r4mp
movu [r4], m4
RET
@@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8
HIGH_SADNXN4D 8, 4
HIGH_SADNXN4D 4, 8
HIGH_SADNXN4D 4, 4
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16, 8, 2
+HIGH_SADNXN4D 8, 16, 2
+HIGH_SADNXN4D 8, 8, 2
+HIGH_SADNXN4D 4, 8, 2
diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c
index 231b67f80..78f8eb8bf 100644
--- a/vpx_dsp/x86/highbd_sad_avx2.c
+++ b/vpx_dsp/x86/highbd_sad_avx2.c
@@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
}
}
-#define HIGHBD_SAD64XN(n) \
- unsigned int vpx_highbd_sad64x##n##_avx2( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
- __m256i sums_32 = _mm256_setzero_si256(); \
- int i; \
- \
- for (i = 0; i < (n / 2); ++i) { \
- __m256i sums_16 = _mm256_setzero_si256(); \
- \
- highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \
- \
- /* sums_16 will outrange after 2 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32 = _mm256_add_epi32( \
- sums_32, \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
- _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
- \
- src += src_stride << 1; \
- ref += ref_stride << 1; \
- } \
- return calc_final(sums_32); \
+static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < (n / 2); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);
+
+ /* sums_16 will outrange after 2 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
}
+ return calc_final(sums_32);
+}
-// 64x64
-HIGHBD_SAD64XN(64)
+#define HIGHBD_SAD64XN(n) \
+ unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
-// 64x32
-HIGHBD_SAD64XN(32)
+#define HIGHBD_SADSKIP64xN(n) \
+ unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
const uint16_t *src, int src_stride,
@@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
}
}
-#define HIGHBD_SAD32XN(n) \
- unsigned int vpx_highbd_sad32x##n##_avx2( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
- __m256i sums_32 = _mm256_setzero_si256(); \
- int i; \
- \
- for (i = 0; i < (n / 8); ++i) { \
- __m256i sums_16 = _mm256_setzero_si256(); \
- \
- highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \
- \
- /* sums_16 will outrange after 8 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32 = _mm256_add_epi32( \
- sums_32, \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
- _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
- \
- src += src_stride << 3; \
- ref += ref_stride << 3; \
- } \
- return calc_final(sums_32); \
- }
+static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
-// 32x64
-HIGHBD_SAD32XN(64)
+ for (i = 0; i < (n / 8); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
-// 32x32
-HIGHBD_SAD32XN(32)
+ highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);
-// 32x16
-HIGHBD_SAD32XN(16)
+ /* sums_16 will outrange after 8 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 3;
+ ref += ref_stride << 3;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD32XN(n) \
+ unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP32xN(n) \
+ unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
const uint16_t *src, int src_stride,
@@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
}
}
-unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
+static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
__m256i sums_32 = _mm256_setzero_si256();
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
int i;
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < num_iters; ++i) {
__m256i sums_16 = _mm256_setzero_si256();
- highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);
// sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
sums_32 = _mm256_add_epi32(
@@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
return calc_final(sums_32);
}
+#define HIGHBD_SAD16XN(n) \
+ unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP16xN(n) \
+ unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
@@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
}
}
+// clang-format off
+HIGHBD_SAD64XN(64)
+HIGHBD_SADSKIP64xN(64)
+HIGHBD_SAD64XN(32)
+HIGHBD_SADSKIP64xN(32)
+HIGHBD_SAD32XN(64)
+HIGHBD_SADSKIP32xN(64)
+HIGHBD_SAD32XN(32)
+HIGHBD_SADSKIP32xN(32)
+HIGHBD_SAD32XN(16)
+HIGHBD_SADSKIP32xN(16)
+HIGHBD_SAD16XN(32)
+HIGHBD_SADSKIP16xN(32)
+HIGHBD_SADSKIP16xN(16)
+HIGHBD_SADSKIP16xN(8)
+// clang-format on
+
// AVG -------------------------------------------------------------------------
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
const uint16_t *src,
diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm
index 6a1a6f3d6..62ad2237f 100644
--- a/vpx_dsp/x86/highbd_sad_sse2.asm
+++ b/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -12,6 +12,11 @@
SECTION .text
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
@@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
-%else ; avg
+%elif %4 == 1 ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
second_pred, n_rows
@@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
-%endif ; avg/sad
+%else ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; double the stride if we are skipping rows
+ lea src_strided, [src_strided*2]
+ lea ref_strided, [ref_strided*2]
+%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
@@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
mov n_rowsd, %1
+%endif
pxor m0, m0
pxor m6, m6
@@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
mov n_rowsd, %1
+%endif
pxor m0, m0
pxor m6, m6
@@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/4
+%else
mov n_rowsd, %1/2
+%endif
pxor m0, m0
pxor m6, m6
@@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
-
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
HIGH_SAD_FN 8, %1, 7, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
pxor m6, m6
@@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
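
The skip variants added above all follow one pattern: double both strides, halve the number of rows visited, and left-shift the accumulated SAD by one at the end. A minimal scalar sketch of that computation for 8-bit samples (the high-bit-depth macros apply the same steps to 16-bit samples); the name sad_skip_sketch and its signature are illustrative placeholders, not functions introduced by this change:

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int width, int height) {
  unsigned int sad = 0;
  int r, c;
  src_stride *= 2;  /* doubled strides: only every other row is read */
  ref_stride *= 2;
  for (r = 0; r < height / 2; ++r) {
    for (c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad << 1;  /* double to approximate the full-height SAD */
}
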
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index 399b67b3f..c87fd3cd2 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
_mm_storeu_si128((__m128i *)sad_array, sum);
}
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[4], int ref_stride,
- uint32_t sad_array[4]) {
+static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
int i;
const uint8_t *refs[4];
__m256i sums[4];
@@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
sums[2] = _mm256_setzero_si256();
sums[3] = _mm256_setzero_si256();
- for (i = 0; i < 32; i++) {
+ for (i = 0; i < h; i++) {
__m256i r[4];
// load src and all ref[]
@@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[4], int ref_stride,
- uint32_t sad_array[4]) {
+static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
__m256i sums[4];
int i;
const uint8_t *refs[4];
@@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
sums[2] = _mm256_setzero_si256();
sums[3] = _mm256_setzero_si256();
- for (i = 0; i < 64; i++) {
+ for (i = 0; i < h; i++) {
__m256i r_lo[4], r_hi[4];
// load 64 bytes from src and all ref[]
const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
@@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
+
+#define SAD64_H(h) \
+ void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad64xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \
+ }
+
+#define SAD32_H(h) \
+ void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad32xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \
+ }
+
+SAD64_H(64)
+SAD32_H(32)
+
+#define SADS64_H(h) \
+ void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \
+ res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+#define SADS32_H(h) \
+ void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad32xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \
+ res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SADS64_H(64)
+SADS64_H(32)
+
+SADS32_H(64)
+SADS32_H(32)
+SADS32_H(16)
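
The SADS64_H and SADS32_H wrappers reuse the shared kernels with doubled strides and half the height, then double each of the four results. For example, SADS64_H(32) above expands to the following (whitespace adjusted and the height constant folded):

void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src, int src_stride,
                                const uint8_t *const ref[4], int ref_stride,
                                uint32_t res[4]) {
  sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, 16, res);
  res[0] <<= 1;
  res[1] <<= 1;
  res[2] <<= 1;
  res[3] <<= 1;
}
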
diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm
index 3f6e55ce9..ed4ea3ef9 100644
--- a/vpx_dsp/x86/sad4d_sse2.asm
+++ b/vpx_dsp/x86/sad4d_sse2.asm
@@ -179,7 +179,16 @@ SECTION .text
; uint8_t *ref[4], int ref_stride,
; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2
+%macro SADNXN4D 2-3 0
+%if %3 == 1 ; skip rows
+%if UNIX64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%else ; normal sad
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
res, ref2, ref3, ref4
@@ -187,6 +196,11 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
ref2, ref3, ref4
%endif
+%endif
+%if %3 == 1
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
mov ref2q, [ref1q+gprsize*1]
@@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
mov ref1q, [ref1q+gprsize*0]
PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 1 ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
+%undef num_rep
PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
%if %1 > 4
@@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
punpckhqdq m5, m7
movifnidn r4, r4mp
paddd m4, m5
+%if %3 == 1
+ pslld m4, 1
+%endif
movu [r4], m4
RET
%else
movifnidn r4, r4mp
pshufd m6, m6, 0x08
pshufd m7, m7, 0x08
+%if %3 == 1
+ pslld m6, 1
+ pslld m7, 1
+%endif
movq [r4+0], m6
movq [r4+8], m7
RET
@@ -237,3 +264,15 @@ SADNXN4D 8, 8
SADNXN4D 8, 4
SADNXN4D 4, 8
SADNXN4D 4, 4
+
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
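
As a concrete check of the row arithmetic: SADNXN4D 64, 64, 1 emits the leading PROCESS_64x2x4 call, num_rep = (64 - 8) / 4 = 14 repeated calls, and the trailing call, i.e. 16 two-row steps over the doubled strides. That reads 32 of the 64 rows, and the final pslld by 1 restores the full-height scale.
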
diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c
index 29bedb0e6..e00494d76 100644
--- a/vpx_dsp/x86/sad_avx2.c
+++ b/vpx_dsp/x86/sad_avx2.c
@@ -11,73 +11,104 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ const int ref2_stride = ref_stride << 1;
+ const int src2_stride = src_stride << 1;
+ const int max = h >> 1;
+ for (i = 0; i < max; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref2_stride;
+ src_ptr += src2_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
#define FSAD64_H(h) \
unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
- int i; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- for (i = 0; i < h; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref_stride; \
- src_ptr += src_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS64_H(h) \
+ unsigned int vpx_sad_skip_64x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
}
#define FSAD32_H(h) \
unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- int ref2_stride = ref_stride << 1; \
- int src2_stride = src_stride << 1; \
- int max = h >> 1; \
- for (i = 0; i < max; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref2_stride; \
- src_ptr += src2_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS32_H(h) \
+ unsigned int vpx_sad_skip_32x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
}
-#define FSAD64 \
- FSAD64_H(64) \
- FSAD64_H(32)
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32) \
+ FSADS64_H(64) \
+ FSADS64_H(32)
-#define FSAD32 \
- FSAD32_H(64) \
- FSAD32_H(32) \
- FSAD32_H(16)
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16) \
+ FSADS32_H(64) \
+ FSADS32_H(32) \
+ FSADS32_H(16)
FSAD64
FSAD32
@@ -86,6 +117,8 @@ FSAD32
#undef FSAD32
#undef FSAD64_H
#undef FSAD32_H
+#undef FSADS64_H
+#undef FSADS32_H
#define FSADAVG64_H(h) \
unsigned int vpx_sad64x##h##_avg_avx2( \
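
The FSADS64_H and FSADS32_H wrappers apply the same convention in plain function form. For example, FSADS64_H(64) expands to the following (whitespace adjusted and the height constant folded):

unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, 32);
}
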
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index e4e1bc3e9..627e463bf 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -12,15 +12,29 @@
SECTION .text
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
-%if %4 == 0
+%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
-%else ; avg
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
second_pred, n_rows
@@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
-%endif ; avg/sad
+%endif ; sad/avg/skip
+%if %4 == 2 ; skip rows, so double the stride
+  lea src_strided, [src_strided*2]
+  lea ref_strided, [ref_strided*2]
+%endif ; %4 skip
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
@@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
SAD_FN 64, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
mov n_rowsd, %1
+%endif
pxor m0, m0
.loop:
movu m1, [refq]
@@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 64, 2 ; sad_skip_64x64_sse2
+SAD64XN 32, 2 ; sad_skip_64x32_sse2
; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
SAD_FN 32, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/4
+%else
mov n_rowsd, %1/2
+%endif
pxor m0, m0
.loop:
movu m1, [refq]
@@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 64, 2 ; sad_skip_32x64_sse2
+SAD32XN 32, 2 ; sad_skip_32x32_sse2
+SAD32XN 16, 2 ; sad_skip_32x16_sse2
; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
SAD_FN 16, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
.loop:
@@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 32, 2 ; sad_skip_16x32_sse2
+SAD16XN 16, 2 ; sad_skip_16x16_sse2
+SAD16XN 8, 2 ; sad_skip_16x8_sse2
; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
SAD_FN 8, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
.loop:
@@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 16, 2 ; sad_skip_8x16_sse2
+SAD8XN 8, 2 ; sad_skip_8x8_sse2
; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
SAD_FN 4, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
.loop:
@@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
+SAD4XN 8, 2 ; sad_skip_4x8_sse
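
The same scaling appears in the row counters above; for instance, SAD64XN 64, 2 sets n_rowsd to 64/2 = 32 and runs over doubled strides, so the loop reads 32 of the 64 rows before pslld m0, 1 doubles the accumulated SAD.
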