-rw-r--r-- | test/dct16x16_test.cc | 48
-rw-r--r-- | test/sad_test.cc | 515
-rw-r--r-- | vp8/vp8_ratectrl_rtc.cc | 28
-rw-r--r-- | vp8/vp8_ratectrl_rtc.h | 3
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 4
-rw-r--r-- | vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 6
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 647
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 7
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 97
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 10
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 2
-rw-r--r-- | vp9/encoder/vp9_speed_features.h | 4
-rw-r--r-- | vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 6
-rw-r--r-- | vpx_dsp/sad.c | 30
-rw-r--r-- | vpx_dsp/variance.h | 4
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 149
-rw-r--r-- | vpx_dsp/x86/fwd_txfm_avx2.c | 373
-rw-r--r-- | vpx_dsp/x86/highbd_sad4d_avx2.c | 313
-rw-r--r-- | vpx_dsp/x86/highbd_sad4d_sse2.asm | 43
-rw-r--r-- | vpx_dsp/x86/highbd_sad_avx2.c | 188
-rw-r--r-- | vpx_dsp/x86/highbd_sad_sse2.asm | 59
-rw-r--r-- | vpx_dsp/x86/sad4d_avx2.c | 66
-rw-r--r-- | vpx_dsp/x86/sad4d_sse2.asm | 43
-rw-r--r-- | vpx_dsp/x86/sad_avx2.c | 145
-rw-r--r-- | vpx_dsp/x86/sad_sse2.asm | 70 |
25 files changed, 2277 insertions, 583 deletions
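The new vpx_sad_skip_* kernels introduced by this change are validated against ReferenceSADSkip() in test/sad_test.cc, which accumulates the absolute differences over every other row and doubles the total so the result stays comparable to a full SAD. A minimal C sketch of that reference computation follows; the function and parameter names are illustrative and not the library's macro-generated code.

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative skip-row SAD: sum |src - ref| over every other row,
     * then double the result to approximate the full-block SAD. */
    static unsigned int sad_skip_rows(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int width, int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r += 2) { /* odd rows are skipped */
        for (c = 0; c < width; ++c) {
          sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
        }
      }
      return 2 * sad; /* compensate for the skipped rows */
    }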
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index d4ef7ae13..3c104f3a4 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -27,6 +27,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" #include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -548,6 +549,44 @@ class Trans16x16TestBase { } } + void RunSpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int c_sum_time = 0; + int simd_sum_time = 0; + + DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]); + + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + } + + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + vpx_fdct16x16_c(input_block, output_ref_block, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_c)); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunFwdTxfm(input_block, output_block, pitch_); + } + + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_mod)); + + printf( + "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time, + simd_sum_time, + (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time))); + } + void CompareInvReference(IdctFunc ref_txfm, int thresh) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 10000; @@ -664,6 +703,8 @@ TEST_P(Trans16x16DCT, QuantCheck) { TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } +TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } + class Trans16x16HT : public Trans16x16TestBase, public ::testing::TestWithParam<Ht16x16Param> { public: @@ -823,6 +864,13 @@ INSTANTIATE_TEST_SUITE_P( 3, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + AVX2, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, + &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, diff --git a/test/sad_test.cc b/test/sad_test.cc index 0896c77f1..561da5ddf 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -42,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); typedef TestParams<SadMxNFunc> SadMxNParam; +typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams<SadSkipMxNFunc> SadSkipMxNParam; + typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); @@ -52,6 +56,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, unsigned int *sad_array); typedef TestParams<SadMxNx4Func> SadMxNx4Param; +typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int 
*sad_array); +typedef TestParams<SadSkipMxNx4Func> SadSkipMxNx4Param; + typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); @@ -170,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { return sad; } + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. + uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + // Sum of Absolute Differences Average. Given two blocks, and a prediction // calculate the absolute difference between one pixel and average of the // corresponding and predicted pixels; accumulate. @@ -290,6 +327,32 @@ class SADx4Test : public SADTestBase<SadMxNx4Param> { } }; +class SADSkipx4Test : public SADTestBase<SadMxNx4Param> { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { public: SADTest() : SADTestBase(GetParam()) {} @@ -317,6 +380,33 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { } }; +class SADSkipTest : public AbstractBench, public SADTestBase<SadMxNParam> { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -397,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) { PrintMedian(title); } +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, 
source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -554,6 +696,105 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -614,6 +855,56 @@ const SadMxNParam c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + 
SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); + const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c), @@ -730,6 +1021,57 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 
10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); + //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -956,6 +1298,54 @@ const SadMxNParam sse2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, 
&vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); + const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2), @@ -1065,6 +1455,57 @@ const SadMxNx4Param x4d_sse2_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, 
&vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -1113,6 +1554,44 @@ const SadMxNParam avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); + const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2), @@ -1180,6 +1659,42 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, 
&vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 65c58536a..60bc258a6 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -294,6 +294,34 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { int VP8RateControlRTC::GetQP() const { return q_; } +int VP8RateControlRTC::GetLoopfilterLevel() const { + VP8_COMMON *cm = &cpi_->common; + const double qp = q_; + + // This model is from linear regression + if (cm->Width * cm->Height <= 320 * 240) { + cm->filter_level = static_cast<int>(0.352685 * qp + 2.957774); + } else if (cm->Width * cm->Height <= 640 * 480) { + cm->filter_level = static_cast<int>(0.485069 * qp - 0.534462); + } else { + cm->filter_level = static_cast<int>(0.314875 * qp + 7.959003); + } + + int min_filter_level = 0; + // This logic is from get_min_filter_level() in picklpf.c + if (q_ > 6 && q_ <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (q_ / 8); + } + + const int max_filter_level = 63; + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + return cm->filter_level; +} + void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index a8a886c56..496ef9eaa 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -42,6 +42,9 @@ class VP8RateControlRTC { bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; + // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter + // level is calculated from frame qp. 
+ int GetLoopfilterLevel() const; // int GetLoopfilterLevel() const; void ComputeQP(const VP8FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 5e6079255..4b94c31f1 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,7 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct vp9_sad_table; struct search_site_config; struct mv; union int_mv; @@ -171,7 +171,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx neon/; # diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 15334b413..255e6fbc4 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -49,7 +49,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -188,8 +188,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 72a6189d1..354f08eae 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1561,13 +1561,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1627,284 +1629,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) 
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, - vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, - vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits8) - - 
HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, - vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, - vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, - vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vpx_highbd_8_sub_pixel_variance32x32, - vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, - vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, - vpx_highbd_8_sub_pixel_variance64x64, - vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits8) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, - vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vpx_highbd_8_sub_pixel_variance16x16, - vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, - vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, - vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, - vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, - vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + 
vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) HIGHBD_BFP( - BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) HIGHBD_BFP( - BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) + + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) break; case VPX_BITS_10: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, - vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, - vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, - vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, - vpx_highbd_sad32x64_avg_bits10, 
vpx_highbd_10_variance32x64, - vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits10) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, - vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vpx_highbd_10_sub_pixel_variance32x32, - vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, - vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vpx_highbd_10_sub_pixel_variance64x64, - vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits10) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, - vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vpx_highbd_10_sub_pixel_variance16x16, - vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, - vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vpx_highbd_10_sub_pixel_variance16x8, - vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, - vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vpx_highbd_10_sub_pixel_variance8x16, - vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits10) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, - vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, - vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, - vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits10) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, - vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, - vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, - vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, - vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + 
vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) break; default: assert(cm->bit_depth == VPX_BITS_12); - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, - vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, - vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, - vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, - vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - 
vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits12) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, - vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vpx_highbd_12_sub_pixel_variance32x32, - vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, - vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vpx_highbd_12_sub_pixel_variance64x64, - vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits12) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, - vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vpx_highbd_12_sub_pixel_variance16x16, - vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, - vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vpx_highbd_12_sub_pixel_variance16x8, - vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, - vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vpx_highbd_12_sub_pixel_variance8x16, - vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits12) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, - vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, - vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, - vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits12) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, - vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, - vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, - vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, 
vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) break; } } @@ -2550,61 +2629,67 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; - - BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d) - - BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d) - - BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d) - - 
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d) - - BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, - vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d) - - BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, - vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d) - - BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, - vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d) - - BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d) - - BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d) - - BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) - - BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) - - BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) - - BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) + + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d) + + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg, + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d) + + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg, + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d) + + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d) + + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d) + + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d) + + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d) + + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, 
vpx_sad_skip_8x16x4d) + + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8, + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, + vpx_sad_skip_8x8x4d) + + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4, + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, + vpx_sad_skip_8x4x4d) + + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8, + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, + vpx_sad_skip_4x8x4d) + + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4, + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, + vpx_sad_skip_4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 0efa836ac..71d8775ea 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -437,6 +437,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; MV center_mv_full = ref_mv_full; unsigned int start_mv_sad; + vp9_sad_fn_ptr_t sad_fn_ptr; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -462,11 +463,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->mv_limits.row_min, x->mv_limits.row_max); start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, cpi->fn_ptr[bsize].sdf, x->sadperbit16); + sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf; + sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df; // Center the initial step/diamond search on best mv. tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; @@ -488,7 +491,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } else { tmp_err = cpi->diamond_search_sad( x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, - x->sadperbit16, &num00, &v_fn_ptr, ref_mv); + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 4ff685b24..64e9ef0f9 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -2055,7 +2055,7 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { int i, j, step; @@ -2117,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { @@ -2142,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, if (is_mv_in(&x->mv_limits, &this_mv)) { const uint8_t *const check_here = ss_os[i] + 
best_address; unsigned int thissad = - fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -2484,24 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, point as the best match, we will do a final 1-away diamond refining search */ static int full_pixel_diamond(const VP9_COMP *const cpi, - const MACROBLOCK *const x, MV *mvp_full, - int step_param, int sadpb, int further_steps, - int do_refine, int *cost_list, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; int bestsme; - unsigned int start_mv_sad; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); - start_mv_sad = - get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. 
+ const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2518,7 +2548,7 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } else { thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, step_param + n, sadpb, &num00, - fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -2536,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (do_refine) { const int search_range = 8; MV best_mv = *dst_mv; - thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, - ref_mv); + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -2546,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } } + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + // Return cost list. 
if (cost_list) { calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); @@ -2697,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; @@ -2706,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -2723,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, best_address - 1, best_address + 1, best_address + in_what->stride }; - fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); for (j = 0; j < 4; ++j) { if (sads[j] < best_sad) { @@ -2743,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2861,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, break; case NSTEP: case MESH: - var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv, tmp_mv); + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); break; default: assert(0 && "Unknown search method"); } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 62a7a047d..fd6a8b9ac 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -41,6 +41,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV *mv) { return &buf->buf[mv->row * buf->stride + mv->col]; @@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, struct VP9_COMP; struct SPEED_FEATURES; +struct vp9_sad_table; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, int error_per_bit, int search_range, - const struct vp9_variance_vtable *fn_ptr, + const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
@@ -96,7 +102,7 @@ extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 034673b49..04804da1c 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -231,6 +231,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; sf->prune_ref_frame_for_rect_partitions = 1; sf->temporal_filter_search_method = NSTEP; sf->tx_size_search_breakout = 1; @@ -926,6 +927,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; + sf->mv.use_downsampled_sad = 0; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index e267e55c4..7cb3f3527 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES { // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. 
+ int use_downsampled_sad; } MV_SPEED_FEATURES; typedef struct PARTITION_SEARCH_BREAKOUT_THR { diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 719ab40f9..80442e359 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -51,7 +51,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); @@ -167,8 +167,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index b47c43430..619d7aa95 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -43,6 +43,12 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ } // Compare |src_ptr| to 4 distinct references in |ref_array[4]| @@ -54,6 +60,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, for (i = 0; i < 4; ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ } /* clang-format off */ @@ -156,6 +171,12 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define highbd_sadMxNx4D(m, n) \ @@ -167,6 +188,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ ref_array[i], ref_stride); \ } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 755cb907d..ccdb2f90b 100644 --- 
a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -69,11 +69,15 @@ typedef struct variance_vtable { #if CONFIG_VP9 typedef struct vp9_variance_vtable { vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. + vpx_sad_fn_t sdsf; vpx_sad_avg_fn_t sdaf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d63be5fb8..e3d48f493 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -597,7 +597,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 neon sse2 msa lsx/; + specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16_1 sse2 neon msa/; @@ -786,6 +786,43 @@ specialize qw/vpx_sad4x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x4 neon msa sse2 mmi/; +add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x16 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x32 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + +add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride"; + # # Avg # @@ -928,6 +965,43 @@ specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x64x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x64x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x16x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x32x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; @@ -991,6 +1065,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad4x4 neon/; + add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, 
int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x8 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + # # Avg # @@ -1084,6 +1194,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x8x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x16x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + # # Structured Similarity (SSIM) # diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c index a2ed420e3..c8f54a49c 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -8,9 +8,382 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <immintrin.h> // AVX2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. 
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. 
+ for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, 
&k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + #if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 947b5e977..e483fdce7 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD64XNX4D(n) \ - void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 2); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 1; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 64x64 -HIGHBD_SAD64XNX4D(64) - -// 64x32 -HIGHBD_SAD64XNX4D(32) +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = 
CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD32XNX4D(n) \ - void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 8); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 3; \ - } \ - 
calc_final_4(sums_32, sad_array); \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 32x64 -HIGHBD_SAD32XNX4D(64) - -// 32x32 -HIGHBD_SAD32XNX4D(32) - -// 32x16 -HIGHBD_SAD32XNX4D(16) +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, } } -void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], - int ref_stride, uint32_t sad_array[4]) { +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *refs[4]; __m256i sums_16[4]; __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); @@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums_32[2] = _mm256_setzero_si256(); sums_32[3] = _mm256_setzero_si256(); - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { sums_16[0] = _mm256_setzero_si256(); sums_16[1] = _mm256_setzero_si256(); sums_16[2] = _mm256_setzero_si256(); sums_16[3] = _mm256_setzero_si256(); - highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32[0] = _mm256_add_epi32( @@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { @@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } } + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git 
a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e01..a07892d81 100644 --- a/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. 
XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f80..78f8eb8bf 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - 
__m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 -HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 
* highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm index 6a1a6f3d6..62ad2237f 100644 --- a/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 
32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 399b67b3f..c87fd3cd2 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 32; i++) { + for (i = 0; i < h; i++) { __m256i r[4]; // load src and all ref[] @@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t 
*refs[4]; @@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 64; i++) { + for (i = 0; i < h; i++) { __m256i r_lo[4], r_hi[4]; // load 64 bytes from src and all ref[] const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); @@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9..ed4ea3ef9 100644 --- a/vpx_dsp/x86/sad4d_sse2.asm +++ b/vpx_dsp/x86/sad4d_sse2.asm @@ -179,7 +179,16 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -187,6 +196,11 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + 
pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 29bedb0e6..e00494d76 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,73 +11,104 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += 
src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } -#define FSAD64 \ - FSAD64_H(64) \ - FSAD64_H(32) +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) -#define FSAD32 \ - FSAD32_H(64) \ - FSAD32_H(32) \ - FSAD32_H(16) +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) FSAD64 FSAD32 @@ -86,6 +117,8 @@ FSAD32 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index e4e1bc3e9..627e463bf 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -12,15 +12,29 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, 
ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro

@@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
+SAD4XN 8, 2 ; sad4x8_skip_sse
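
All of the new sad_skip / highbd_sad_skip entry points in this change follow the same recipe: run the underlying SAD kernel over every other row (both strides doubled, row count halved), then double the result so it stays on the same scale as a full-height SAD. The scalar sketch below only illustrates that recipe; the helper names sad_rows_c and sad_skip_c are invented here and do not appear in the patch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Plain scalar SAD over a width x height block (illustrative only). */
static unsigned int sad_rows_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* Downsampled SAD in the style of the new skip kernels: visit every other
 * row by doubling both strides and halving the height, then scale by 2 so
 * the result is comparable to a full SAD. */
static unsigned int sad_skip_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int width, int height) {
  return 2 * sad_rows_c(src, 2 * src_stride, ref, 2 * ref_stride, width,
                        height / 2);
}

int main(void) {
  uint8_t src[16 * 16], ref[16 * 16];
  int i;
  for (i = 0; i < 16 * 16; ++i) {
    src[i] = (uint8_t)(i & 0xff);
    ref[i] = (uint8_t)((i * 3) & 0xff);
  }
  printf("full SAD: %u  skip SAD: %u\n",
         sad_rows_c(src, 16, ref, 16, 16, 16),
         sad_skip_c(src, 16, ref, 16, 16, 16));
  return 0;
}

In the assembly above the same three steps appear as lea src_strided, [2*src_strided] / lea ref_strided, [2*ref_strided], a halved n_rowsd, and a final pslld m0, 1; the x4d and AVX2 wrappers do it by calling the shared *xN helper with doubled strides and n / 2, then left-shifting each output by one.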
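
The other recurring idiom in the high bit-depth AVX2 kernels is the periodic flush of 16-bit partial sums into 32-bit accumulators (the "sums_16 will outrange after N rows" comments): with up to 12-bit samples, a 16-bit lane can only absorb a limited number of row differences before it would overflow, so after each batch of rows the sixteen 16-bit lanes are zero-extended and folded into sums_32. The sketch below isolates that widening step; accumulate_u16_to_u32 is a hypothetical name used for illustration, assuming an AVX2 target and <immintrin.h>.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Zero-extend the sixteen 16-bit partial SADs in sums_16 to 32 bits and
 * add them, lane by lane, into the running 32-bit accumulator sums_32. */
static __m256i accumulate_u16_to_u32(__m256i sums_32, __m256i sums_16) {
  const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
  const __m256i hi =
      _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
  return _mm256_add_epi32(sums_32, _mm256_add_epi32(lo, hi));
}

int main(void) {
  /* Sixteen partial sums of 1000 fold into 32-bit lanes totalling 16000. */
  const __m256i sums_16 = _mm256_set1_epi16(1000);
  __m256i sums_32 = _mm256_setzero_si256();
  int32_t out[8];
  uint32_t total = 0;
  int i;
  sums_32 = accumulate_u16_to_u32(sums_32, sums_16);
  _mm256_storeu_si256((__m256i *)out, sums_32);
  for (i = 0; i < 8; ++i) total += (uint32_t)out[i];
  printf("total = %u\n", total); /* 16000 */
  return 0;
}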