-rw-r--r-- | test/dct16x16_test.cc | 48
-rw-r--r-- | test/sad_test.cc | 515
-rw-r--r-- | vp8/vp8_ratectrl_rtc.cc | 28
-rw-r--r-- | vp8/vp8_ratectrl_rtc.h | 3
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 4
-rw-r--r-- | vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 6
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 647
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 7
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 97
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 10
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 2
-rw-r--r-- | vp9/encoder/vp9_speed_features.h | 4
-rw-r--r-- | vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 6
-rw-r--r-- | vpx_dsp/sad.c | 30
-rw-r--r-- | vpx_dsp/variance.h | 4
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 149
-rw-r--r-- | vpx_dsp/x86/fwd_txfm_avx2.c | 373
-rw-r--r-- | vpx_dsp/x86/highbd_sad4d_avx2.c | 313
-rw-r--r-- | vpx_dsp/x86/highbd_sad4d_sse2.asm | 43
-rw-r--r-- | vpx_dsp/x86/highbd_sad_avx2.c | 188
-rw-r--r-- | vpx_dsp/x86/highbd_sad_sse2.asm | 59
-rw-r--r-- | vpx_dsp/x86/sad4d_avx2.c | 66
-rw-r--r-- | vpx_dsp/x86/sad4d_sse2.asm | 43
-rw-r--r-- | vpx_dsp/x86/sad_avx2.c | 145
-rw-r--r-- | vpx_dsp/x86/sad_sse2.asm | 70 |
25 files changed, 2277 insertions, 583 deletions
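The new vpx_sad_skip_* kernels introduced by this change are validated against ReferenceSADSkip() in test/sad_test.cc, which accumulates the absolute differences over every other row and doubles the total so the result stays comparable to a full SAD. A minimal C sketch of that reference computation follows; the function and parameter names are illustrative and not the library's macro-generated code.

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative skip-row SAD: sum |src - ref| over every other row,
     * then double the result to approximate the full-block SAD. */
    static unsigned int sad_skip_rows(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int width, int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r += 2) { /* odd rows are skipped */
        for (c = 0; c < width; ++c) {
          sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
        }
      }
      return 2 * sad; /* compensate for the skipped rows */
    }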
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index d4ef7ae13..3c104f3a4 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -27,6 +27,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" #include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -548,6 +549,44 @@ class Trans16x16TestBase { } } + void RunSpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int c_sum_time = 0; + int simd_sum_time = 0; + + DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]); + + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + } + + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + vpx_fdct16x16_c(input_block, output_ref_block, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_c)); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunFwdTxfm(input_block, output_block, pitch_); + } + + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_mod)); + + printf( + "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time, + simd_sum_time, + (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time))); + } + void CompareInvReference(IdctFunc ref_txfm, int thresh) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 10000; @@ -664,6 +703,8 @@ TEST_P(Trans16x16DCT, QuantCheck) { TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } +TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } + class Trans16x16HT : public Trans16x16TestBase, public ::testing::TestWithParam<Ht16x16Param> { public: @@ -823,6 +864,13 @@ INSTANTIATE_TEST_SUITE_P( 3, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + AVX2, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, + &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, diff --git a/test/sad_test.cc b/test/sad_test.cc index 0896c77f1..561da5ddf 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -42,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); typedef TestParams<SadMxNFunc> SadMxNParam; +typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams<SadSkipMxNFunc> SadSkipMxNParam; + typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); @@ -52,6 +56,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, unsigned int *sad_array); typedef TestParams<SadMxNx4Func> SadMxNx4Param; +typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int 
*sad_array); +typedef TestParams<SadSkipMxNx4Func> SadSkipMxNx4Param; + typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); @@ -170,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { return sad; } + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. + uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + // Sum of Absolute Differences Average. Given two blocks, and a prediction // calculate the absolute difference between one pixel and average of the // corresponding and predicted pixels; accumulate. @@ -290,6 +327,32 @@ class SADx4Test : public SADTestBase<SadMxNx4Param> { } }; +class SADSkipx4Test : public SADTestBase<SadMxNx4Param> { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { public: SADTest() : SADTestBase(GetParam()) {} @@ -317,6 +380,33 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { } }; +class SADSkipTest : public AbstractBench, public SADTestBase<SadMxNParam> { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -397,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) { PrintMedian(title); } +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, 
source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -554,6 +696,105 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -614,6 +855,56 @@ const SadMxNParam c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + 
SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); + const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c), @@ -730,6 +1021,57 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 
10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); + //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -956,6 +1298,54 @@ const SadMxNParam sse2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, 
&vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); + const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2), @@ -1065,6 +1455,57 @@ const SadMxNx4Param x4d_sse2_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, 
&vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -1113,6 +1554,44 @@ const SadMxNParam avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); + const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2), @@ -1180,6 +1659,42 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, 
&vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 65c58536a..60bc258a6 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -294,6 +294,34 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { int VP8RateControlRTC::GetQP() const { return q_; } +int VP8RateControlRTC::GetLoopfilterLevel() const { + VP8_COMMON *cm = &cpi_->common; + const double qp = q_; + + // This model is from linear regression + if (cm->Width * cm->Height <= 320 * 240) { + cm->filter_level = static_cast<int>(0.352685 * qp + 2.957774); + } else if (cm->Width * cm->Height <= 640 * 480) { + cm->filter_level = static_cast<int>(0.485069 * qp - 0.534462); + } else { + cm->filter_level = static_cast<int>(0.314875 * qp + 7.959003); + } + + int min_filter_level = 0; + // This logic is from get_min_filter_level() in picklpf.c + if (q_ > 6 && q_ <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (q_ / 8); + } + + const int max_filter_level = 63; + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + return cm->filter_level; +} + void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index a8a886c56..496ef9eaa 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -42,6 +42,9 @@ class VP8RateControlRTC { bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; + // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter + // level is calculated from frame qp. 
+ int GetLoopfilterLevel() const; // int GetLoopfilterLevel() const; void ComputeQP(const VP8FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 5e6079255..4b94c31f1 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,7 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct vp9_sad_table; struct search_site_config; struct mv; union int_mv; @@ -171,7 +171,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx neon/; # diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 15334b413..255e6fbc4 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -49,7 +49,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -188,8 +188,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 72a6189d1..354f08eae 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1561,13 +1561,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1627,284 +1629,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) 
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, - vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, - vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits8) - - 
HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, - vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, - vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, - vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vpx_highbd_8_sub_pixel_variance32x32, - vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, - vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, - vpx_highbd_8_sub_pixel_variance64x64, - vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits8) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, - vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vpx_highbd_8_sub_pixel_variance16x16, - vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, - vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, - vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, - vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, - vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + 
vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) HIGHBD_BFP( - BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) HIGHBD_BFP( - BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) + + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) break; case VPX_BITS_10: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, - vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, - vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, - vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, - vpx_highbd_sad32x64_avg_bits10, 
vpx_highbd_10_variance32x64, - vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits10) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, - vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vpx_highbd_10_sub_pixel_variance32x32, - vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, - vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vpx_highbd_10_sub_pixel_variance64x64, - vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits10) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, - vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vpx_highbd_10_sub_pixel_variance16x16, - vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, - vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vpx_highbd_10_sub_pixel_variance16x8, - vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, - vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vpx_highbd_10_sub_pixel_variance8x16, - vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits10) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, - vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, - vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, - vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits10) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, - vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, - vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, - vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, - vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + 
vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) break; default: assert(cm->bit_depth == VPX_BITS_12); - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, - vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, - vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, - vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, - vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - 
vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits12) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, - vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vpx_highbd_12_sub_pixel_variance32x32, - vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, - vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vpx_highbd_12_sub_pixel_variance64x64, - vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits12) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, - vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vpx_highbd_12_sub_pixel_variance16x16, - vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, - vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vpx_highbd_12_sub_pixel_variance16x8, - vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, - vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vpx_highbd_12_sub_pixel_variance8x16, - vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits12) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, - vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, - vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, - vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits12) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, - vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, - vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, - vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, 
vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) break; } } @@ -2550,61 +2629,67 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; - - BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d) - - BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d) - - BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d) - - 
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d) - - BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, - vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d) - - BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, - vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d) - - BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, - vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d) - - BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d) - - BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d) - - BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) - - BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) - - BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) - - BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) + + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d) + + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg, + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d) + + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg, + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d) + + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d) + + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d) + + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d) + + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d) + + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, 
vpx_sad_skip_8x16x4d) + + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8, + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, + vpx_sad_skip_8x8x4d) + + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4, + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, + vpx_sad_skip_8x4x4d) + + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8, + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, + vpx_sad_skip_4x8x4d) + + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4, + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, + vpx_sad_skip_4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 0efa836ac..71d8775ea 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -437,6 +437,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; MV center_mv_full = ref_mv_full; unsigned int start_mv_sad; + vp9_sad_fn_ptr_t sad_fn_ptr; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -462,11 +463,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->mv_limits.row_min, x->mv_limits.row_max); start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, cpi->fn_ptr[bsize].sdf, x->sadperbit16); + sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf; + sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df; // Center the initial step/diamond search on best mv. tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; @@ -488,7 +491,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } else { tmp_err = cpi->diamond_search_sad( x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, - x->sadperbit16, &num00, &v_fn_ptr, ref_mv); + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 4ff685b24..64e9ef0f9 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -2055,7 +2055,7 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { int i, j, step; @@ -2117,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { @@ -2142,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, if (is_mv_in(&x->mv_limits, &this_mv)) { const uint8_t *const check_here = ss_os[i] + 
best_address; unsigned int thissad = - fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -2484,24 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, point as the best match, we will do a final 1-away diamond refining search */ static int full_pixel_diamond(const VP9_COMP *const cpi, - const MACROBLOCK *const x, MV *mvp_full, - int step_param, int sadpb, int further_steps, - int do_refine, int *cost_list, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; int bestsme; - unsigned int start_mv_sad; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); - start_mv_sad = - get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. 
+ const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2518,7 +2548,7 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } else { thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, step_param + n, sadpb, &num00, - fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -2536,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (do_refine) { const int search_range = 8; MV best_mv = *dst_mv; - thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, - ref_mv); + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -2546,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } } + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + // Return cost list. 
if (cost_list) { calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); @@ -2697,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; @@ -2706,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -2723,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, best_address - 1, best_address + 1, best_address + in_what->stride }; - fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); for (j = 0; j < 4; ++j) { if (sads[j] < best_sad) { @@ -2743,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2861,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, break; case NSTEP: case MESH: - var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv, tmp_mv); + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); break; default: assert(0 && "Unknown search method"); } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 62a7a047d..fd6a8b9ac 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -41,6 +41,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV *mv) { return &buf->buf[mv->row * buf->stride + mv->col]; @@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, struct VP9_COMP; struct SPEED_FEATURES; +struct vp9_sad_table; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, int error_per_bit, int search_range, - const struct vp9_variance_vtable *fn_ptr, + const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
@@ -96,7 +102,7 @@ extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 034673b49..04804da1c 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -231,6 +231,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; sf->prune_ref_frame_for_rect_partitions = 1; sf->temporal_filter_search_method = NSTEP; sf->tx_size_search_breakout = 1; @@ -926,6 +927,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; + sf->mv.use_downsampled_sad = 0; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index e267e55c4..7cb3f3527 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES { // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. 
+ int use_downsampled_sad; } MV_SPEED_FEATURES; typedef struct PARTITION_SEARCH_BREAKOUT_THR { diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 719ab40f9..80442e359 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -51,7 +51,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); @@ -167,8 +167,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index b47c43430..619d7aa95 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -43,6 +43,12 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ } // Compare |src_ptr| to 4 distinct references in |ref_array[4]| @@ -54,6 +60,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, for (i = 0; i < 4; ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ } /* clang-format off */ @@ -156,6 +171,12 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define highbd_sadMxNx4D(m, n) \ @@ -167,6 +188,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ ref_array[i], ref_stride); \ } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 755cb907d..ccdb2f90b 100644 --- 
a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -69,11 +69,15 @@ typedef struct variance_vtable { #if CONFIG_VP9 typedef struct vp9_variance_vtable { vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. + vpx_sad_fn_t sdsf; vpx_sad_avg_fn_t sdaf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d63be5fb8..e3d48f493 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -597,7 +597,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 neon sse2 msa lsx/; + specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16_1 sse2 neon msa/; @@ -786,6 +786,43 @@ specialize qw/vpx_sad4x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x4 neon msa sse2 mmi/; +add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x16 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x32 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + +add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride"; + # # Avg # @@ -928,6 +965,43 @@ specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x64x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x64x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x16x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x32x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; @@ -991,6 +1065,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad4x4 neon/; + add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, 
int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x8 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + # # Avg # @@ -1084,6 +1194,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x8x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x16x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + # # Structured Similarity (SSIM) # diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c index a2ed420e3..c8f54a49c 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -8,9 +8,382 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <immintrin.h> // AVX2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. 
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. 
+ for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, 
&k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + #if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 947b5e977..e483fdce7 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD64XNX4D(n) \ - void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 2); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 1; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 64x64 -HIGHBD_SAD64XNX4D(64) - -// 64x32 -HIGHBD_SAD64XNX4D(32) +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = 
CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD32XNX4D(n) \ - void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 8); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 3; \ - } \ - 
calc_final_4(sums_32, sad_array); \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 32x64 -HIGHBD_SAD32XNX4D(64) - -// 32x32 -HIGHBD_SAD32XNX4D(32) - -// 32x16 -HIGHBD_SAD32XNX4D(16) +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, } } -void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], - int ref_stride, uint32_t sad_array[4]) { +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *refs[4]; __m256i sums_16[4]; __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); @@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums_32[2] = _mm256_setzero_si256(); sums_32[3] = _mm256_setzero_si256(); - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { sums_16[0] = _mm256_setzero_si256(); sums_16[1] = _mm256_setzero_si256(); sums_16[2] = _mm256_setzero_si256(); sums_16[3] = _mm256_setzero_si256(); - highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32[0] = _mm256_add_epi32( @@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { @@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } } + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git 
a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e01..a07892d81 100644 --- a/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. 
XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f80..78f8eb8bf 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - 
__m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 -HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 
* highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm index 6a1a6f3d6..62ad2237f 100644 --- a/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 
32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 399b67b3f..c87fd3cd2 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 32; i++) { + for (i = 0; i < h; i++) { __m256i r[4]; // load src and all ref[] @@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t 
*refs[4]; @@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 64; i++) { + for (i = 0; i < h; i++) { __m256i r_lo[4], r_hi[4]; // load 64 bytes from src and all ref[] const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); @@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9..ed4ea3ef9 100644 --- a/vpx_dsp/x86/sad4d_sse2.asm +++ b/vpx_dsp/x86/sad4d_sse2.asm @@ -179,7 +179,16 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -187,6 +196,11 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + 
pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 29bedb0e6..e00494d76 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,73 +11,104 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += 
src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } -#define FSAD64 \ - FSAD64_H(64) \ - FSAD64_H(32) +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) -#define FSAD32 \ - FSAD32_H(64) \ - FSAD32_H(32) \ - FSAD32_H(16) +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) FSAD64 FSAD32 @@ -86,6 +117,8 @@ FSAD32 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index e4e1bc3e9..627e463bf 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -12,15 +12,29 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, 
ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro

@@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
+SAD4XN 8, 2 ; sad4x8_skip_sse
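
All of the new sad_skip / highbd_sad_skip entry points in this change follow the same recipe: run the underlying SAD kernel over every other row (both strides doubled, row count halved), then double the result so it stays on the same scale as a full-height SAD. The scalar sketch below only illustrates that recipe; the helper names sad_rows_c and sad_skip_c are invented here and do not appear in the patch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Plain scalar SAD over a width x height block (illustrative only). */
static unsigned int sad_rows_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* Downsampled SAD in the style of the new skip kernels: visit every other
 * row by doubling both strides and halving the height, then scale by 2 so
 * the result is comparable to a full SAD. */
static unsigned int sad_skip_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int width, int height) {
  return 2 * sad_rows_c(src, 2 * src_stride, ref, 2 * ref_stride, width,
                        height / 2);
}

int main(void) {
  uint8_t src[16 * 16], ref[16 * 16];
  int i;
  for (i = 0; i < 16 * 16; ++i) {
    src[i] = (uint8_t)(i & 0xff);
    ref[i] = (uint8_t)((i * 3) & 0xff);
  }
  printf("full SAD: %u  skip SAD: %u\n",
         sad_rows_c(src, 16, ref, 16, 16, 16),
         sad_skip_c(src, 16, ref, 16, 16, 16));
  return 0;
}

In the assembly above the same three steps appear as lea src_strided, [2*src_strided] / lea ref_strided, [2*ref_strided], a halved n_rowsd, and a final pslld m0, 1; the x4d and AVX2 wrappers do it by calling the shared *xN helper with doubled strides and n / 2, then left-shifting each output by one.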
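
The other recurring idiom in the high bit-depth AVX2 kernels is the periodic flush of 16-bit partial sums into 32-bit accumulators (the "sums_16 will outrange after N rows" comments): with up to 12-bit samples, a 16-bit lane can only absorb a limited number of row differences before it would overflow, so after each batch of rows the sixteen 16-bit lanes are zero-extended and folded into sums_32. The sketch below isolates that widening step; accumulate_u16_to_u32 is a hypothetical name used for illustration, assuming an AVX2 target and <immintrin.h>.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Zero-extend the sixteen 16-bit partial SADs in sums_16 to 32 bits and
 * add them, lane by lane, into the running 32-bit accumulator sums_32. */
static __m256i accumulate_u16_to_u32(__m256i sums_32, __m256i sums_16) {
  const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
  const __m256i hi =
      _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
  return _mm256_add_epi32(sums_32, _mm256_add_epi32(lo, hi));
}

int main(void) {
  /* Sixteen partial sums of 1000 fold into 32-bit lanes totalling 16000. */
  const __m256i sums_16 = _mm256_set1_epi16(1000);
  __m256i sums_32 = _mm256_setzero_si256();
  int32_t out[8];
  uint32_t total = 0;
  int i;
  sums_32 = accumulate_u16_to_u32(sums_32, sums_16);
  _mm256_storeu_si256((__m256i *)out, sums_32);
  for (i = 0; i < 8; ++i) total += (uint32_t)out[i];
  printf("total = %u\n", total); /* 16000 */
  return 0;
}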