Diffstat (limited to 'chromium/third_party/libgav1')
83 files changed, 8272 insertions, 4951 deletions
diff --git a/chromium/third_party/libgav1/BUILD.gn b/chromium/third_party/libgav1/BUILD.gn
index 9a31f423f6e..3a28871b8d2 100644
--- a/chromium/third_party/libgav1/BUILD.gn
+++ b/chromium/third_party/libgav1/BUILD.gn
@@ -16,6 +16,10 @@ config("public_libgav1_config") {
     "LIBGAV1_THREADPOOL_USE_STD_MUTEX",  # to avoid abseil dependency.
     "LIBGAV1_ENABLE_LOGGING=0",  # to avoid debug log of libgav1 in chromium
                                  # debug build.
+
+    # Don't let libgav1 export any symbols. Otherwise the verify_order step on
+    # macOS can fail since these exports end up in the final Chromium binary.
+    "LIBGAV1_PUBLIC=",
   ]
 }
diff --git a/chromium/third_party/libgav1/README.chromium b/chromium/third_party/libgav1/README.chromium
index fc62bb71907..27a8fe8222f 100644
--- a/chromium/third_party/libgav1/README.chromium
+++ b/chromium/third_party/libgav1/README.chromium
@@ -2,9 +2,9 @@ Name: libgav1
 Short Name: libgav1
 URL: https://chromium.googlesource.com/codecs/libgav1/
 Version: 0
-Date: Thursday March 26 2020
+Date: Saturday May 23 2020
 Branch: master
-Commit: 638ef84819f8b3cd614dcf63378fe4814aa4cb2a
+Commit: bf190c43e5c7cc81751867c917a81bc2920be079
 License: Apache 2.0
 License File: libgav1/LICENSE
 Security Critical: yes
diff --git a/chromium/third_party/libgav1/libgav1_srcs.gni b/chromium/third_party/libgav1/libgav1_srcs.gni
index e460d030f1b..9dc54f97124 100644
--- a/chromium/third_party/libgav1/libgav1_srcs.gni
+++ b/chromium/third_party/libgav1/libgav1_srcs.gni
@@ -15,8 +15,6 @@ gav1_common_sources = [
   "//third_party/libgav1/src/src/frame_scratch_buffer.h",
   "//third_party/libgav1/src/src/internal_frame_buffer_list.cc",
   "//third_party/libgav1/src/src/internal_frame_buffer_list.h",
-  "//third_party/libgav1/src/src/loop_filter_mask.cc",
-  "//third_party/libgav1/src/src/loop_filter_mask.h",
   "//third_party/libgav1/src/src/loop_restoration_info.cc",
   "//third_party/libgav1/src/src/loop_restoration_info.h",
   "//third_party/libgav1/src/src/motion_vector.cc",
@@ -146,6 +144,10 @@ gav1_dsp_sources = [
   "//third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.h",
   "//third_party/libgav1/src/src/dsp/x86/mask_blend_sse4.cc",
   "//third_party/libgav1/src/src/dsp/x86/mask_blend_sse4.h",
+  "//third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc",
+  "//third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h",
+  "//third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc",
+  "//third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h",
   "//third_party/libgav1/src/src/dsp/x86/obmc_sse4.cc",
   "//third_party/libgav1/src/src/dsp/x86/obmc_sse4.h",
   "//third_party/libgav1/src/src/dsp/x86/super_res_sse4.cc",
   "//third_party/libgav1/src/src/dsp/x86/super_res_sse4.h",
@@ -215,6 +217,7 @@ gav1_utils_sources = [
   "//third_party/libgav1/src/src/utils/queue.h",
   "//third_party/libgav1/src/src/utils/raw_bit_reader.cc",
   "//third_party/libgav1/src/src/utils/raw_bit_reader.h",
+  "//third_party/libgav1/src/src/utils/reference_info.h",
   "//third_party/libgav1/src/src/utils/segmentation.cc",
   "//third_party/libgav1/src/src/utils/segmentation.h",
   "//third_party/libgav1/src/src/utils/segmentation_map.cc",
diff --git a/chromium/third_party/libgav1/src/README.md b/chromium/third_party/libgav1/src/README.md
index ead3fc3b8ee..a5799d1395b 100644
--- a/chromium/third_party/libgav1/src/README.md
+++ b/chromium/third_party/libgav1/src/README.md
@@ -56,6 +56,9 @@ Configuration options:
   absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
   dependency from the core library. Automatically defined in
   `src/utils/threadpool.h` if unset.
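The `LIBGAV1_PUBLIC=` define added to BUILD.gn above works because the library's exported entry points are declared through that macro; defining it to nothing removes the visibility attribute so the symbols stay internal to the Chromium binary. A minimal sketch of how such an export macro is commonly structured (the names below are illustrative, not libgav1's actual header):

```cpp
// visibility_example.h -- illustrative only; not libgav1's actual header.
#ifndef EXAMPLE_VISIBILITY_H_
#define EXAMPLE_VISIBILITY_H_

// If the build defines EXAMPLE_PUBLIC (possibly to nothing, as the GN change
// above does with "LIBGAV1_PUBLIC="), that definition wins and the functions
// below are not exported from the final binary.
#ifndef EXAMPLE_PUBLIC
#if defined(_WIN32)
#define EXAMPLE_PUBLIC __declspec(dllexport)
#else
#define EXAMPLE_PUBLIC __attribute__((visibility("default")))
#endif
#endif

// Exported unless EXAMPLE_PUBLIC is defined to be empty.
EXAMPLE_PUBLIC int ExampleDecodeFrame();

#endif  // EXAMPLE_VISIBILITY_H_
```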
+* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is + allowed to create. Has to be an integer > 0. Otherwise this is ignored. + The default value is 128. For additional options see: diff --git a/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake b/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake index cd5ff9e1230..31017a9de14 100644 --- a/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake +++ b/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake @@ -63,6 +63,14 @@ macro(libgav1_set_build_definitions) list(APPEND libgav1_clang_cxx_flags "-Wmissing-prototypes" "-Wshorten-64-to-32") + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6") + # Quiet warnings in copy-list-initialization where {} elision has always + # been allowed. + list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces") + endif() + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7") # Quiet warnings due to potential snprintf() truncation in threadpool.cc. diff --git a/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake b/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake index 295b078756a..0b8df60f3df 100644 --- a/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake +++ b/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake @@ -212,14 +212,17 @@ endmacro() macro(libgav1_set_cxx_flags) unset(cxx_flag_lists) - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND cxx_flag_lists libgav1_clang_cxx_flags) - endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") list(APPEND cxx_flag_lists libgav1_base_cxx_flags) endif() + # Append clang flags after the base set to allow -Wno* overrides to take + # effect. Some of the base flags may enable a large set of warnings, e.g., + # -Wall. + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND cxx_flag_lists libgav1_clang_cxx_flags) + endif() + if(MSVC) list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags) endif() diff --git a/chromium/third_party/libgav1/src/examples/file_reader.cc b/chromium/third_party/libgav1/src/examples/file_reader.cc index f174e2d67b6..b0967227ef8 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader.cc +++ b/chromium/third_party/libgav1/src/examples/file_reader.cc @@ -26,7 +26,6 @@ #include <io.h> #endif -#include "absl/memory/memory.h" #include "examples/file_reader_constants.h" #include "examples/file_reader_factory.h" #include "examples/file_reader_interface.h" @@ -53,10 +52,9 @@ FileReader::~FileReader() { } std::unique_ptr<FileReaderInterface> FileReader::Open( - absl::string_view file_name, const bool error_tolerant) { + const std::string& file_name, const bool error_tolerant) { if (file_name.empty()) return nullptr; - const std::string fopen_file_name = std::string(file_name); FILE* raw_file_ptr; bool owns_file = true; @@ -64,14 +62,14 @@ std::unique_ptr<FileReaderInterface> FileReader::Open( raw_file_ptr = SetBinaryMode(stdin); owns_file = false; // stdin is owned by the Standard C Library. 
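The examples/ changes in this patch drop the Abseil helpers: `absl::string_view` parameters become `const std::string&`, `ABSL_MUST_USE_RESULT` is commented out, and `absl::WrapUnique(new (std::nothrow) ...)` becomes a plain `std::unique_ptr` holding a nothrow allocation, as in `FileReader::Open()` below. A small sketch of the smart-pointer part of that pattern, using a hypothetical `Widget` type:

```cpp
#include <memory>
#include <new>

struct Widget {
  explicit Widget(int id) : id(id) {}
  int id;
};

std::unique_ptr<Widget> MakeWidget(int id) {
  // Before: auto w = absl::WrapUnique(new (std::nothrow) Widget(id));
  // After: the standard library is sufficient; ownership semantics are
  // identical and allocation failure still yields a null pointer.
  std::unique_ptr<Widget> w(new (std::nothrow) Widget(id));
  if (w == nullptr) {
    return nullptr;  // out of memory
  }
  return w;
}
```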
} else { - raw_file_ptr = fopen(fopen_file_name.c_str(), "rb"); + raw_file_ptr = fopen(file_name.c_str(), "rb"); } if (raw_file_ptr == nullptr) { return nullptr; } - auto file = absl::WrapUnique( + std::unique_ptr<FileReader> file( new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant)); if (file == nullptr) { LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory"); diff --git a/chromium/third_party/libgav1/src/examples/file_reader.h b/chromium/third_party/libgav1/src/examples/file_reader.h index ad5911e32fe..c342a20df1e 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader.h +++ b/chromium/third_party/libgav1/src/examples/file_reader.h @@ -21,10 +21,9 @@ #include <cstdint> #include <cstdio> #include <memory> +#include <string> #include <vector> -#include "absl/base/attributes.h" -#include "absl/strings/string_view.h" #include "examples/file_reader_interface.h" namespace libgav1 { @@ -42,7 +41,7 @@ class FileReader : public FileReaderInterface { // ReadTemporalUnit() may return truncated data. // Returns nullptr when the file does not exist, cannot be read, or is not an // IVF file. - static std::unique_ptr<FileReaderInterface> Open(absl::string_view file_name, + static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name, bool error_tolerant = false); FileReader() = delete; @@ -62,10 +61,10 @@ class FileReader : public FileReaderInterface { // The |timestamp| pointer is optional: callers not interested in timestamps // can pass nullptr. When |timestamp| is not a nullptr, this function returns // the presentation timestamp from the IVF frame header. - ABSL_MUST_USE_RESULT bool ReadTemporalUnit(std::vector<uint8_t>* tu_data, - int64_t* timestamp) override; + /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit( + std::vector<uint8_t>* tu_data, int64_t* timestamp) override; - ABSL_MUST_USE_RESULT bool IsEndOfFile() const override { + /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override { return feof(file_) != 0; } diff --git a/chromium/third_party/libgav1/src/examples/file_reader_factory.cc b/chromium/third_party/libgav1/src/examples/file_reader_factory.cc index 860d916423d..d5260eba893 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader_factory.cc +++ b/chromium/third_party/libgav1/src/examples/file_reader_factory.cc @@ -38,7 +38,7 @@ bool FileReaderFactory::RegisterReader(OpenFunction open_function) { } std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader( - absl::string_view file_name, const bool error_tolerant /*= false*/) { + const std::string& file_name, const bool error_tolerant /*= false*/) { for (auto* open_function : *GetFileReaderOpenFunctions()) { auto reader = open_function(file_name, error_tolerant); if (reader == nullptr) continue; diff --git a/chromium/third_party/libgav1/src/examples/file_reader_factory.h b/chromium/third_party/libgav1/src/examples/file_reader_factory.h index ddf8744d19b..0f534845e75 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader_factory.h +++ b/chromium/third_party/libgav1/src/examples/file_reader_factory.h @@ -18,8 +18,8 @@ #define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_ #include <memory> +#include <string> -#include "absl/strings/string_view.h" #include "examples/file_reader_interface.h" namespace libgav1 { @@ -27,7 +27,7 @@ namespace libgav1 { class FileReaderFactory { public: using OpenFunction = std::unique_ptr<FileReaderInterface> (*)( - absl::string_view file_name, bool error_tolerant); + const std::string& file_name, bool error_tolerant); FileReaderFactory() = 
delete; FileReaderFactory(const FileReaderFactory&) = delete; @@ -43,7 +43,7 @@ class FileReaderFactory { // returned. If |error_tolerant| is true and the reader supports it, some // format and read errors may be ignored and partial data returned. static std::unique_ptr<FileReaderInterface> OpenReader( - absl::string_view file_name, bool error_tolerant = false); + const std::string& file_name, bool error_tolerant = false); }; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/examples/file_reader_interface.h b/chromium/third_party/libgav1/src/examples/file_reader_interface.h index d768017e2ba..d8f703091e2 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader_interface.h +++ b/chromium/third_party/libgav1/src/examples/file_reader_interface.h @@ -21,8 +21,6 @@ #include <cstdint> #include <vector> -#include "absl/base/attributes.h" - namespace libgav1 { class FileReaderInterface { @@ -47,10 +45,10 @@ class FileReaderInterface { // The |timestamp| pointer is optional: callers not interested in timestamps // can pass nullptr. When |timestamp| is not a nullptr, this function returns // the presentation timestamp of the temporal unit. - ABSL_MUST_USE_RESULT virtual bool ReadTemporalUnit( + /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit( std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0; - ABSL_MUST_USE_RESULT virtual bool IsEndOfFile() const = 0; + /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0; // The values returned by these accessors are strictly informative. No // validation is performed when they are read from file. diff --git a/chromium/third_party/libgav1/src/examples/file_writer.cc b/chromium/third_party/libgav1/src/examples/file_writer.cc index bf13d4a1199..54afe145cde 100644 --- a/chromium/third_party/libgav1/src/examples/file_writer.cc +++ b/chromium/third_party/libgav1/src/examples/file_writer.cc @@ -25,8 +25,6 @@ #include <io.h> #endif -#include "absl/memory/memory.h" -#include "absl/strings/str_format.h" #include "examples/logging.h" namespace libgav1 { @@ -72,9 +70,8 @@ std::string GetY4mColorSpaceString( if (y4m_parameters.bitdepth > 8) { const bool monochrome = y4m_parameters.image_format == kImageFormatMonochrome400; - color_space_string = - absl::StrFormat("%s%s%d", color_space_string, monochrome ? 
"" : "p", - y4m_parameters.bitdepth); + if (!monochrome) color_space_string += "p"; + color_space_string += std::to_string(y4m_parameters.bitdepth); } return color_space_string; @@ -85,7 +82,7 @@ std::string GetY4mColorSpaceString( FileWriter::~FileWriter() { fclose(file_); } std::unique_ptr<FileWriter> FileWriter::Open( - absl::string_view file_name, FileType file_type, + const std::string& file_name, FileType file_type, const Y4mParameters* const y4m_parameters) { if (file_name.empty() || (file_type == kFileTypeY4m && y4m_parameters == nullptr) || @@ -94,13 +91,12 @@ std::unique_ptr<FileWriter> FileWriter::Open( return nullptr; } - const std::string fopen_file_name = std::string(file_name); FILE* raw_file_ptr; if (file_name == "-") { raw_file_ptr = SetBinaryMode(stdout); } else { - raw_file_ptr = fopen(fopen_file_name.c_str(), "wb"); + raw_file_ptr = fopen(file_name.c_str(), "wb"); } if (raw_file_ptr == nullptr) { @@ -108,7 +104,7 @@ std::unique_ptr<FileWriter> FileWriter::Open( return nullptr; } - auto file = absl::WrapUnique(new (std::nothrow) FileWriter(raw_file_ptr)); + std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr)); if (file == nullptr) { LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory"); fclose(raw_file_ptr); @@ -173,11 +169,13 @@ bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) { // // More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2 bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) { - std::string y4m_header = absl::StrFormat( - "YUV4MPEG2 W%zu H%zu F%zu:%zu Ip C%s\n", y4m_parameters.width, - y4m_parameters.height, y4m_parameters.frame_rate_numerator, - y4m_parameters.frame_rate_denominator, - GetY4mColorSpaceString(y4m_parameters)); + std::string y4m_header = "YUV4MPEG2"; + y4m_header += " W" + std::to_string(y4m_parameters.width); + y4m_header += " H" + std::to_string(y4m_parameters.height); + y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) + + ":" + std::to_string(y4m_parameters.frame_rate_denominator); + y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters); + y4m_header += "\n"; return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) == y4m_header.length(); } diff --git a/chromium/third_party/libgav1/src/examples/file_writer.h b/chromium/third_party/libgav1/src/examples/file_writer.h index a7b1937dd37..00f6cc38097 100644 --- a/chromium/third_party/libgav1/src/examples/file_writer.h +++ b/chromium/third_party/libgav1/src/examples/file_writer.h @@ -21,9 +21,8 @@ #include <cstdint> #include <cstdio> #include <memory> +#include <string> -#include "absl/base/attributes.h" -#include "absl/strings/string_view.h" #include "gav1/decoder_buffer.h" namespace libgav1 { @@ -70,7 +69,7 @@ class FileWriter { // Returns a FileWriter instance after the file is opened successfully for // kFileTypeRaw files, and after the Y4M file header bytes are written for // kFileTypeY4m files. Returns nullptr upon failure. - static std::unique_ptr<FileWriter> Open(absl::string_view file_name, + static std::unique_ptr<FileWriter> Open(const std::string& file_name, FileType type, const Y4mParameters* y4m_parameters); @@ -86,7 +85,8 @@ class FileWriter { // Writes the frame data in |frame_buffer| to |file_|. Returns true after // successful write of |frame_buffer| data. 
- ABSL_MUST_USE_RESULT bool WriteFrame(const DecoderBuffer& frame_buffer); + /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame( + const DecoderBuffer& frame_buffer); private: explicit FileWriter(FILE* file) : file_(file) {} diff --git a/chromium/third_party/libgav1/src/examples/logging.h b/chromium/third_party/libgav1/src/examples/logging.h index ba784ef5c15..536ed1dbaf2 100644 --- a/chromium/third_party/libgav1/src/examples/logging.h +++ b/chromium/third_party/libgav1/src/examples/logging.h @@ -18,6 +18,7 @@ #define LIBGAV1_EXAMPLES_LOGGING_H_ #include <cstddef> +#include <cstdio> namespace libgav1 { namespace examples { diff --git a/chromium/third_party/libgav1/src/src/buffer_pool.cc b/chromium/third_party/libgav1/src/src/buffer_pool.cc index 282da8c948b..c1a5606cd11 100644 --- a/chromium/third_party/libgav1/src/src/buffer_pool.cc +++ b/chromium/third_party/libgav1/src/src/buffer_pool.cc @@ -69,27 +69,13 @@ bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) { render_height_ = frame_header.render_height; rows4x4_ = frame_header.rows4x4; columns4x4_ = frame_header.columns4x4; - const int rows4x4_half = DivideBy2(rows4x4_); - const int columns4x4_half = DivideBy2(columns4x4_); - if (!motion_field_reference_frame_.Reset(rows4x4_half, columns4x4_half, - /*zero_initialize=*/false) || - !motion_field_mv_.Reset(rows4x4_half, columns4x4_half, - /*zero_initialize=*/false)) { - return false; - } - if (frame_header.refresh_frame_flags != 0) { - // Initialize so that Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip - // some updates when the updates are the same as the initialized value. - // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify - // branch conditions in motion field projection. - // The following memory initialization of contiguous memory is very fast. It - // is not recommended to make the initialization multi-threaded, unless the - // memory which needs to be initialized in each thread is still contiguous. 
- static_assert(sizeof(motion_field_reference_frame_[0][0]) == sizeof(int8_t), - ""); - memset(motion_field_reference_frame_.data(), kReferenceFrameIntra, - sizeof(motion_field_reference_frame_[0][0]) * - motion_field_reference_frame_.size()); + if (frame_header.refresh_frame_flags != 0 && + !IsIntraFrame(frame_header.frame_type)) { + const int rows4x4_half = DivideBy2(rows4x4_); + const int columns4x4_half = DivideBy2(columns4x4_); + if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) { + return false; + } } return segmentation_map_.Allocate(rows4x4_, columns4x4_); } diff --git a/chromium/third_party/libgav1/src/src/buffer_pool.h b/chromium/third_party/libgav1/src/src/buffer_pool.h index 07adc838f12..13008c10cd2 100644 --- a/chromium/third_party/libgav1/src/src/buffer_pool.h +++ b/chromium/third_party/libgav1/src/src/buffer_pool.h @@ -19,6 +19,7 @@ #include <array> #include <cassert> +#include <climits> #include <condition_variable> // NOLINT (unapproved c++11 header) #include <cstdint> #include <cstring> @@ -29,9 +30,9 @@ #include "src/gav1/frame_buffer.h" #include "src/internal_frame_buffer_list.h" #include "src/symbol_decoder_context.h" -#include "src/utils/array_2d.h" #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" +#include "src/utils/reference_info.h" #include "src/utils/segmentation.h" #include "src/utils/segmentation_map.h" #include "src/utils/types.h" @@ -108,21 +109,11 @@ class RefCountedBuffer { bool showable_frame() const { return showable_frame_; } void set_showable_frame(bool value) { showable_frame_ = value; } - // This array has kNumReferenceFrameTypes elements. - const uint8_t* order_hint_array() const { return order_hint_.data(); } - uint8_t order_hint(ReferenceFrameType reference_frame) const { - return order_hint_[reference_frame]; - } - void set_order_hint(ReferenceFrameType reference_frame, uint8_t order_hint) { - order_hint_[reference_frame] = order_hint; - } - void ClearOrderHints() { order_hint_.fill(0); } - // Sets upscaled_width_, frame_width_, frame_height_, render_width_, // render_height_, rows4x4_ and columns4x4_ from the corresponding fields - // in frame_header. Allocates motion_field_reference_frame_, - // motion_field_mv_, and segmentation_map_. Returns true on success, false - // on failure. + // in frame_header. Allocates reference_info_.motion_field_reference_frame, + // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on + // success, false on failure. bool SetFrameDimensions(const ObuFrameHeader& frame_header); int32_t upscaled_width() const { return upscaled_width_; } @@ -135,27 +126,6 @@ class RefCountedBuffer { int32_t rows4x4() const { return rows4x4_; } int32_t columns4x4() const { return columns4x4_; } - // Entry at |row|, |column| corresponds to - // MfRefFrames[row * 2 + 1][column * 2 + 1] in the spec. - ReferenceFrameType* motion_field_reference_frame(int row, int column) { - return &motion_field_reference_frame_[row][column]; - } - - const ReferenceFrameType* motion_field_reference_frame(int row, - int column) const { - return &motion_field_reference_frame_[row][column]; - } - - // Entry at |row|, |column| corresponds to - // MfMvs[row * 2 + 1][column * 2 + 1] in the spec. 
- MotionVector* motion_field_mv(int row, int column) { - return &motion_field_mv_[row][column]; - } - - const MotionVector* motion_field_mv(int row, int column) const { - return &motion_field_mv_[row][column]; - } - SegmentationMap* segmentation_map() { return &segmentation_map_; } const SegmentationMap* segmentation_map() const { return &segmentation_map_; } @@ -205,6 +175,9 @@ class RefCountedBuffer { film_grain_params_ = params; } + const ReferenceInfo* reference_info() const { return &reference_info_; } + ReferenceInfo* reference_info() { return &reference_info_; } + // This will wake up the WaitUntil*() functions and make them return false. void Abort() { { @@ -217,8 +190,10 @@ class RefCountedBuffer { } void SetFrameState(FrameState frame_state) { - std::lock_guard<std::mutex> lock(mutex_); - frame_state_ = frame_state; + { + std::lock_guard<std::mutex> lock(mutex_); + frame_state_ = frame_state; + } if (frame_state == kFrameStateParsed) { parsed_condvar_.notify_all(); } else if (frame_state == kFrameStateDecoded) { @@ -230,9 +205,11 @@ class RefCountedBuffer { // Sets the progress of this frame to |progress_row| and notifies any threads // that may be waiting on rows <= |progress_row|. void SetProgress(int progress_row) { - std::lock_guard<std::mutex> lock(mutex_); - if (progress_row_ >= progress_row) return; - progress_row_ = progress_row; + { + std::lock_guard<std::mutex> lock(mutex_); + if (progress_row_ >= progress_row) return; + progress_row_ = progress_row; + } progress_row_condvar_.notify_all(); } @@ -257,8 +234,14 @@ class RefCountedBuffer { } // Waits until the |progress_row| has been decoded (as indicated either by - // |progress_row_| or |frame_state_|). - bool WaitUntil(int progress_row) { + // |progress_row_| or |frame_state_|). |progress_row_cache| must not be + // nullptr and will be populated with the value of |progress_row_| after the + // wait. + // + // Typical usage of |progress_row_cache| is as follows: + // * Initialize |*progress_row_cache| to INT_MIN. + // * Call WaitUntil only if |*progress_row_cache| < |progress_row|. + bool WaitUntil(int progress_row, int* progress_row_cache) { // If |progress_row| is negative, it means that the wait is on the top // border to be available. The top border will be available when row 0 has // been decoded. So we can simply wait on row 0 instead. @@ -268,6 +251,11 @@ class RefCountedBuffer { !abort_) { progress_row_condvar_.wait(lock); } + // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no + // longer be updated. So we set |*progress_row_cache| to INT_MAX in that + // case. + *progress_row_cache = + (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX; return !abort_; } @@ -311,8 +299,6 @@ class RefCountedBuffer { ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown; bool showable_frame_ = false; - std::array<uint8_t, kNumReferenceFrameTypes> order_hint_ = {}; - int32_t upscaled_width_ = 0; int32_t frame_width_ = 0; int32_t frame_height_ = 0; @@ -321,12 +307,6 @@ class RefCountedBuffer { int32_t columns4x4_ = 0; int32_t rows4x4_ = 0; - // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds - // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec. - Array2D<ReferenceFrameType> motion_field_reference_frame_; - // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds - // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. - Array2D<MotionVector> motion_field_mv_; // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array. 
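The new `WaitUntil(int progress_row, int* progress_row_cache)` contract above lets callers skip the lock entirely when a previously observed progress value already covers the row they need. A hedged sketch of a caller following the usage described in the comment (the `Buffer` type and the helper are placeholders, not part of the patch):

```cpp
#include <climits>

// Returns false if decoding of |buffer| was aborted. |*progress_row_cache|
// starts out as INT_MIN and is reused across calls, so the mutex inside
// WaitUntil() is only taken when the needed row has not yet been observed.
template <typename Buffer>
bool WaitForRow(Buffer* buffer, int needed_row, int* progress_row_cache) {
  if (*progress_row_cache >= needed_row) return true;  // already decoded
  return buffer->WaitUntil(needed_row, progress_row_cache);
}

// Typical use inside a prediction loop:
//   int progress_row_cache = INT_MIN;
//   for (...) {
//     if (!WaitForRow(reference_frame, reference_row, &progress_row_cache))
//       return false;
//     ...
//   }
```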
SegmentationMap segmentation_map_; @@ -344,6 +324,7 @@ class RefCountedBuffer { // on feature_enabled only, we also save their values as an optimization. Segmentation segmentation_ = {}; FilmGrainParams film_grain_params_ = {}; + ReferenceInfo reference_info_; }; // RefCountedBufferPtr contains a reference to a RefCountedBuffer. diff --git a/chromium/third_party/libgav1/src/src/decoder_impl.cc b/chromium/third_party/libgav1/src/src/decoder_impl.cc index 841e4efed4b..508bbde4822 100644 --- a/chromium/third_party/libgav1/src/src/decoder_impl.cc +++ b/chromium/third_party/libgav1/src/src/decoder_impl.cc @@ -27,7 +27,6 @@ #include "src/film_grain.h" #include "src/frame_buffer_utils.h" #include "src/frame_scratch_buffer.h" -#include "src/loop_filter_mask.h" #include "src/loop_restoration_info.h" #include "src/obu_parser.h" #include "src/post_filter.h" @@ -36,6 +35,7 @@ #include "src/threading_strategy.h" #include "src/utils/blocking_counter.h" #include "src/utils/common.h" +#include "src/utils/constants.h" #include "src/utils/logging.h" #include "src/utils/parameter_tree.h" #include "src/utils/raw_bit_reader.h" @@ -61,6 +61,41 @@ int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration, return border; } +// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on +// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first +// |count| condition variables in +// |frame_scratch_buffer->superblock_row_progress_condvar|. +void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer, + int count) { + { + std::lock_guard<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + frame_scratch_buffer->tile_decoding_failed = true; + } + std::condition_variable* const condvars = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + for (int i = 0; i < count; ++i) { + condvars[i].notify_one(); + } +} + +// Helper class that releases the frame scratch buffer in the destructor. +class FrameScratchBufferReleaser { + public: + FrameScratchBufferReleaser( + FrameScratchBufferPool* frame_scratch_buffer_pool, + std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer) + : frame_scratch_buffer_pool_(frame_scratch_buffer_pool), + frame_scratch_buffer_(frame_scratch_buffer) {} + ~FrameScratchBufferReleaser() { + frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_)); + } + + private: + FrameScratchBufferPool* const frame_scratch_buffer_pool_; + std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_; +}; + } // namespace // static @@ -107,22 +142,40 @@ DecoderImpl::~DecoderImpl() { } StatusCode DecoderImpl::Init() { + if (!GenerateWedgeMask(&wedge_masks_)) { + LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed."); + return kStatusOutOfMemory; + } + return kStatusOk; +} + +StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue( + const uint8_t* data, size_t size) { if (settings_.frame_parallel) { -#if defined(ENABLE_FRAME_PARALLEL) - if (settings_.threads > 1) { - if (!InitializeThreadPoolsForFrameParallel(settings_.threads, - &frame_thread_pool_)) { - return kStatusOutOfMemory; - } - // TODO(b/142583029): Frame parallel decoding with in-frame - // multi-threading is not yet implemented. Until then, we force - // settings_.threads to 1 when frame parallel decoding is enabled. 
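`SetFrameState()` and `SetProgress()` above were rescoped so the mutex is released before the condition variable is notified; waiters that wake up can then acquire the lock immediately instead of blocking on it again. A generic sketch of that scoped-lock-then-notify pattern:

```cpp
#include <condition_variable>
#include <mutex>

class Progress {
 public:
  void Set(int row) {
    {
      // Update the shared state while holding the lock...
      std::lock_guard<std::mutex> lock(mutex_);
      if (row <= row_) return;
      row_ = row;
    }
    // ...but notify after the lock has been released.
    condvar_.notify_all();
  }

  void WaitFor(int row) {
    std::unique_lock<std::mutex> lock(mutex_);
    condvar_.wait(lock, [this, row] { return row_ >= row; });
  }

 private:
  std::mutex mutex_;
  std::condition_variable condvar_;
  int row_ = -1;
};
```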
- settings_.threads = 1; + DecoderState state; + std::unique_ptr<ObuParser> obu( + new (std::nothrow) ObuParser(data, size, &buffer_pool_, &state)); + if (obu == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser."); + return kStatusOutOfMemory; + } + RefCountedBufferPtr current_frame; + const StatusCode status = obu->ParseOneFrame(¤t_frame); + if (status != kStatusOk) { + LIBGAV1_DLOG(ERROR, "Failed to parse OBU."); + return status; + } + current_frame = nullptr; + // We assume that the first frame that was parsed will contain the frame + // header. This assumption is usually true in practice. So we will simply + // not use frame parallel mode if this is not the case. + if (settings_.threads > 1 && + !InitializeThreadPoolsForFrameParallel( + settings_.threads, obu->frame_header().tile_info.tile_count, + obu->frame_header().tile_info.tile_columns, &frame_thread_pool_, + &frame_scratch_buffer_pool_)) { + return kStatusOutOfMemory; } -#else - LIBGAV1_DLOG( - ERROR, "Frame parallel decoding is not implemented, ignoring setting."); -#endif // defined(ENABLE_FRAME_PARALLEL) } const int max_allowed_frames = GetMaxAllowedFrames(); assert(max_allowed_frames > 0); @@ -130,10 +183,6 @@ StatusCode DecoderImpl::Init() { LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed."); return kStatusOutOfMemory; } - if (!GenerateWedgeMask(&wedge_masks_)) { - LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed."); - return kStatusOutOfMemory; - } return kStatusOk; } @@ -141,7 +190,19 @@ StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size, int64_t user_private_data, void* buffer_private_data) { if (data == nullptr || size == 0) return kStatusInvalidArgument; - if (abort_) return kStatusUnknownError; + if (HasFailure()) return kStatusUnknownError; + if (!seen_first_frame_) { + seen_first_frame_ = true; + const StatusCode status = + InitializeFrameThreadPoolAndTemporalUnitQueue(data, size); + if (status != kStatusOk) { + if (settings_.release_input_buffer != nullptr) { + settings_.release_input_buffer(settings_.callback_private_data, + buffer_private_data); + } + return SignalFailure(status); + } + } if (temporal_units_.Full()) { return kStatusTryAgain; } @@ -153,11 +214,13 @@ StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size, StatusCode DecoderImpl::SignalFailure(StatusCode status) { if (status == kStatusOk || status == kStatusTryAgain) return status; - abort_ = true; - failure_status_ = status; // Make sure all waiting threads exit. 
buffer_pool_.Abort(); frame_thread_pool_ = nullptr; + { + std::lock_guard<std::mutex> lock(mutex_); + failure_status_ = status; + } while (!temporal_units_.Empty()) { if (settings_.release_input_buffer != nullptr) { settings_.release_input_buffer( @@ -197,17 +260,22 @@ StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) { temporal_units_.Pop(); return status; } - if (settings_.blocking_dequeue) { + { std::unique_lock<std::mutex> lock(mutex_); - while (!temporal_unit.decoded && !abort_) { - decoded_condvar_.wait(lock); + if (settings_.blocking_dequeue) { + while (!temporal_unit.decoded && failure_status_ == kStatusOk) { + decoded_condvar_.wait(lock); + } + } else { + if (!temporal_unit.decoded && failure_status_ == kStatusOk) { + return kStatusTryAgain; + } + } + if (failure_status_ != kStatusOk) { + const StatusCode failure_status = failure_status_; + lock.unlock(); + return SignalFailure(failure_status); } - } else { - std::lock_guard<std::mutex> lock(mutex_); - if (!temporal_unit.decoded && !abort_) return kStatusTryAgain; - } - if (abort_) { - return SignalFailure(failure_status_); } if (settings_.release_input_buffer != nullptr) { settings_.release_input_buffer(settings_.callback_private_data, @@ -290,33 +358,32 @@ StatusCode DecoderImpl::ParseAndSchedule() { std::lock_guard<std::mutex> lock(mutex_); temporal_unit.has_displayable_frame = false; temporal_unit.decoded = true; - decoded_condvar_.notify_one(); return kStatusOk; } for (auto& frame : temporal_unit.frames) { EncodedFrame* const encoded_frame = &frame; frame_thread_pool_->Schedule([this, encoded_frame]() { - if (abort_) return; + if (HasFailure()) return; const StatusCode status = DecodeFrame(encoded_frame); - if (abort_) return; encoded_frame->state = {}; encoded_frame->frame = nullptr; TemporalUnit& temporal_unit = encoded_frame->temporal_unit; std::lock_guard<std::mutex> lock(mutex_); + if (failure_status_ != kStatusOk) return; // temporal_unit's status defaults to kStatusOk. So we need to set it only - // on error. If |abort_| is true at this point, it means that there has - // already been a failure. So we don't care about this subsequent failure. - // We will simply return the error code of the first failure. + // on error. If |failure_status_| is not kStatusOk at this point, it means + // that there has already been a failure. So we don't care about this + // subsequent failure. We will simply return the error code of the first + // failure. if (status != kStatusOk) { temporal_unit.status = status; - if (!abort_) { - abort_ = true; + if (failure_status_ == kStatusOk) { failure_status_ = status; } } temporal_unit.decoded = ++temporal_unit.decoded_count == temporal_unit.frames.size(); - if (temporal_unit.decoded || abort_) { + if (temporal_unit.decoded || failure_status_ != kStatusOk) { decoded_condvar_.notify_one(); } }); @@ -330,6 +397,17 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) { const Vector<ObuTileGroup>& tile_groups = encoded_frame->tile_groups; RefCountedBufferPtr current_frame = std::move(encoded_frame->frame); + std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = + frame_scratch_buffer_pool_.Get(); + if (frame_scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); + return kStatusOutOfMemory; + } + // |frame_scratch_buffer| will be released when this local variable goes out + // of scope (i.e.) on any return path in this function. 
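The comment just below refers to `FrameScratchBufferReleaser`, the small RAII helper introduced earlier in this file: it returns the scratch buffer to the pool from its destructor, so every early `return` in `DecodeFrame()` and `DecodeTemporalUnit()` releases the buffer without a manual `Release()` call. A simplified sketch of the same idea with generic pool and buffer types (names are illustrative):

```cpp
#include <memory>
#include <utility>

template <typename Pool, typename Buffer>
class PoolReleaser {
 public:
  PoolReleaser(Pool* pool, std::unique_ptr<Buffer>* buffer)
      : pool_(pool), buffer_(buffer) {}
  // Runs on every return path of the enclosing function.
  ~PoolReleaser() { pool_->Release(std::move(*buffer_)); }

 private:
  Pool* const pool_;
  std::unique_ptr<Buffer>* const buffer_;
};

// Usage sketch:
//   auto buffer = pool.Get();
//   if (buffer == nullptr) return kStatusOutOfMemory;  // nothing to release
//   PoolReleaser<BufferPool, Buffer> releaser(&pool, &buffer);
//   ...  // any early return from here on releases |buffer| back to the pool
```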
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser( + &frame_scratch_buffer_pool_, &frame_scratch_buffer); + StatusCode status; if (!frame_header.show_existing_frame) { if (tile_groups.empty()) { @@ -339,16 +417,9 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) { // not have a reason to handle those cases, so we simply continue. return kStatusOk; } - std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = - frame_scratch_buffer_pool_.Get(); - if (frame_scratch_buffer == nullptr) { - LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); - return kStatusOutOfMemory; - } status = DecodeTiles(sequence_header, frame_header, tile_groups, encoded_frame->state, frame_scratch_buffer.get(), current_frame.get()); - frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer)); if (status != kStatusOk) { return status; } @@ -362,8 +433,9 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) { return kStatusOk; } RefCountedBufferPtr film_grain_frame; - status = ApplyFilmGrain(sequence_header, frame_header, current_frame, - &film_grain_frame, /*thread_pool=*/nullptr); + status = ApplyFilmGrain( + sequence_header, frame_header, current_frame, &film_grain_frame, + frame_scratch_buffer->threading_strategy.thread_pool()); if (status != kStatusOk) { return status; } @@ -402,6 +474,17 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit, RefCountedBufferPtr current_frame; RefCountedBufferPtr displayable_frame; StatusCode status; + std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = + frame_scratch_buffer_pool_.Get(); + if (frame_scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); + return kStatusOutOfMemory; + } + // |frame_scratch_buffer| will be released when this local variable goes out + // of scope (i.e.) on any return path in this function. + FrameScratchBufferReleaser frame_scratch_buffer_releaser( + &frame_scratch_buffer_pool_, &frame_scratch_buffer); + while (obu->HasData()) { status = obu->ParseOneFrame(¤t_frame); if (status != kStatusOk) { @@ -433,16 +516,9 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit, // not have a reason to handle those cases, so we simply continue. 
continue; } - std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = - frame_scratch_buffer_pool_.Get(); - if (frame_scratch_buffer == nullptr) { - LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); - return kStatusOutOfMemory; - } status = DecodeTiles(obu->sequence_header(), obu->frame_header(), obu->tile_groups(), state_, frame_scratch_buffer.get(), current_frame.get()); - frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer)); if (status != kStatusOk) { return status; } @@ -463,17 +539,10 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit, } displayable_frame = std::move(current_frame); RefCountedBufferPtr film_grain_frame; - std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = - frame_scratch_buffer_pool_.Get(); - if (frame_scratch_buffer == nullptr) { - LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); - return kStatusOutOfMemory; - } status = ApplyFilmGrain( obu->sequence_header(), obu->frame_header(), displayable_frame, &film_grain_frame, frame_scratch_buffer->threading_strategy.film_grain_thread_pool()); - frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer)); if (status != kStatusOk) return status; displayable_frame = std::move(film_grain_frame); } @@ -572,25 +641,6 @@ StatusCode DecoderImpl::DecodeTiles( RefCountedBuffer* const current_frame) { frame_scratch_buffer->tile_scratch_buffer_pool.Reset( sequence_header.color_config.bitdepth); - if (IsFrameParallel()) { - // We can parse the current frame if all the reference frames have been - // parsed. - for (int i = 0; i < kNumReferenceFrameTypes; ++i) { - if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) { - continue; - } - if (!state.reference_frame[i]->WaitUntilParsed()) { - return kStatusUnknownError; - } - } - } - if (PostFilter::DoDeblock(frame_header, settings_.post_filter_mask)) { - if (kDeblockFilterBitMask && !frame_scratch_buffer->loop_filter_mask.Reset( - frame_header.width, frame_header.height)) { - LIBGAV1_DLOG(ERROR, "Failed to allocate memory for loop filter masks."); - return kStatusOutOfMemory; - } - } if (!frame_scratch_buffer->loop_restoration_info.Reset( &frame_header.loop_restoration, frame_header.upscaled_width, frame_header.height, sequence_header.color_config.subsampling_x, @@ -671,11 +721,10 @@ StatusCode DecoderImpl::DecodeTiles( // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so // that the block parameters cache can be filled in for the last row/column // without having to check for boundary conditions. - BlockParametersHolder block_parameters_holder( - frame_header.rows4x4 + kMaxBlockHeight4x4, - frame_header.columns4x4 + kMaxBlockWidth4x4, - sequence_header.use_128x128_superblock); - if (!block_parameters_holder.Init()) { + if (!frame_scratch_buffer->block_parameters_holder.Reset( + frame_header.rows4x4 + kMaxBlockHeight4x4, + frame_header.columns4x4 + kMaxBlockWidth4x4, + sequence_header.use_128x128_superblock)) { return kStatusOutOfMemory; } const dsp::Dsp* const dsp = @@ -685,24 +734,6 @@ StatusCode DecoderImpl::DecodeTiles( sequence_header.color_config.bitdepth); return kStatusInternalError; } - // If prev_segment_ids is a null pointer, it is treated as if it pointed to - // a segmentation map containing all 0s. 
- const SegmentationMap* prev_segment_ids = nullptr; - if (frame_header.primary_reference_frame == kPrimaryReferenceNone) { - frame_scratch_buffer->symbol_decoder_context.Initialize( - frame_header.quantizer.base_index); - } else { - const int index = - frame_header - .reference_frame_index[frame_header.primary_reference_frame]; - const RefCountedBuffer* prev_frame = state.reference_frame[index].get(); - frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext(); - if (frame_header.segmentation.enabled && - prev_frame->columns4x4() == frame_header.columns4x4 && - prev_frame->rows4x4() == frame_header.rows4x4) { - prev_segment_ids = prev_frame->segmentation_map(); - } - } const uint8_t tile_size_bytes = frame_header.tile_info.tile_size_bytes; const int tile_count = tile_groups.back().end + 1; @@ -714,26 +745,12 @@ StatusCode DecoderImpl::DecodeTiles( } ThreadingStrategy& threading_strategy = frame_scratch_buffer->threading_strategy; - if (!threading_strategy.Reset(frame_header, settings_.threads)) { + if (!IsFrameParallel() && + !threading_strategy.Reset(frame_header, settings_.threads)) { return kStatusOutOfMemory; } if (threading_strategy.row_thread_pool(0) != nullptr || IsFrameParallel()) { - const int block_width4x4_minus_one = - sequence_header.use_128x128_superblock ? 31 : 15; - const int block_width4x4_log2 = - sequence_header.use_128x128_superblock ? 5 : 4; - const int superblock_rows = - (frame_header.rows4x4 + block_width4x4_minus_one) >> - block_width4x4_log2; - const int superblock_columns = - (frame_header.columns4x4 + block_width4x4_minus_one) >> - block_width4x4_log2; - if (!frame_scratch_buffer->superblock_state.Reset(superblock_rows, - superblock_columns)) { - LIBGAV1_DLOG(ERROR, "Failed to allocate super_block_state.\n"); - return kStatusOutOfMemory; - } if (frame_scratch_buffer->residual_buffer_pool == nullptr) { frame_scratch_buffer->residual_buffer_pool.reset( new (std::nothrow) ResidualBufferPool( @@ -818,25 +835,80 @@ StatusCode DecoderImpl::DecodeTiles( } } - PostFilter post_filter( - frame_header, sequence_header, &frame_scratch_buffer->loop_filter_mask, - frame_scratch_buffer->cdef_index, - frame_scratch_buffer->inter_transform_sizes, - &frame_scratch_buffer->loop_restoration_info, &block_parameters_holder, - current_frame->buffer(), &frame_scratch_buffer->deblock_buffer, dsp, - threading_strategy.post_filter_thread_pool(), - frame_scratch_buffer->threaded_window_buffer.get(), - frame_scratch_buffer->superres_line_buffer.get(), - settings_.post_filter_mask); + PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer, + current_frame->buffer(), dsp, + settings_.post_filter_mask); + + if (IsFrameParallel()) { + // We can parse the current frame if all the reference frames have been + // parsed. + for (int i = 0; i < kNumReferenceFrameTypes; ++i) { + if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) { + continue; + } + if (!state.reference_frame[i]->WaitUntilParsed()) { + return kStatusUnknownError; + } + } + } + + // If prev_segment_ids is a null pointer, it is treated as if it pointed to + // a segmentation map containing all 0s. 
+ const SegmentationMap* prev_segment_ids = nullptr; + if (frame_header.primary_reference_frame == kPrimaryReferenceNone) { + frame_scratch_buffer->symbol_decoder_context.Initialize( + frame_header.quantizer.base_index); + } else { + const int index = + frame_header + .reference_frame_index[frame_header.primary_reference_frame]; + const RefCountedBuffer* prev_frame = state.reference_frame[index].get(); + frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext(); + if (frame_header.segmentation.enabled && + prev_frame->columns4x4() == frame_header.columns4x4 && + prev_frame->rows4x4() == frame_header.rows4x4) { + prev_segment_ids = prev_frame->segmentation_map(); + } + } + // The Tile class must make use of a separate buffer to store the unfiltered // pixels for the intra prediction of the next superblock row. This is done // only when one of the following conditions are true: - // * frame_parallel is true. + // * IsFrameParallel() is true. // * settings_.threads == 1. // In the non-frame-parallel multi-threaded case, we do not run the post // filters in the decode loop. So this buffer need not be used. const bool use_intra_prediction_buffer = IsFrameParallel() || settings_.threads == 1; + if (use_intra_prediction_buffer) { + if (!frame_scratch_buffer->intra_prediction_buffers.Resize( + frame_header.tile_info.tile_rows)) { + LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers."); + return kStatusOutOfMemory; + } + IntraPredictionBuffer* const intra_prediction_buffers = + frame_scratch_buffer->intra_prediction_buffers.get(); + for (int plane = 0; plane < num_planes; ++plane) { + const int subsampling = + (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x; + const size_t intra_prediction_buffer_size = + ((MultiplyBy4(frame_header.columns4x4) >> subsampling) * + (sequence_header.color_config.bitdepth == 8 ? 
sizeof(uint8_t) + : sizeof(uint16_t))); + for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows; + ++tile_row) { + if (!intra_prediction_buffers[tile_row][plane].Resize( + intra_prediction_buffer_size)) { + LIBGAV1_DLOG(ERROR, + "Failed to allocate intra prediction buffer for tile " + "row %d plane %d.\n", + tile_row, plane); + return kStatusOutOfMemory; + } + } + } + } + SymbolDecoderContext saved_symbol_decoder_context; int tile_index = 0; BlockingCounterWithStatus pending_tiles(tile_count); @@ -870,7 +942,7 @@ StatusCode DecoderImpl::DecodeTiles( tile_number, tile_group.data + byte_offset, tile_size, sequence_header, frame_header, current_frame, state, frame_scratch_buffer, wedge_masks_, &saved_symbol_decoder_context, - prev_segment_ids, &post_filter, &block_parameters_holder, dsp, + prev_segment_ids, &post_filter, dsp, threading_strategy.row_thread_pool(tile_index++), &pending_tiles, IsFrameParallel(), use_intra_prediction_buffer); if (tile == nullptr) { @@ -885,7 +957,12 @@ StatusCode DecoderImpl::DecodeTiles( } assert(tiles.size() == static_cast<size_t>(tile_count)); if (IsFrameParallel()) { - return DecodeTilesFrameParallel( + if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) { + return DecodeTilesFrameParallel( + sequence_header, frame_header, tiles, saved_symbol_decoder_context, + prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame); + } + return DecodeTilesThreadedFrameParallel( sequence_header, frame_header, tiles, saved_symbol_decoder_context, prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame); } @@ -894,10 +971,8 @@ StatusCode DecoderImpl::DecodeTiles( status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles, frame_scratch_buffer, &post_filter); } else { - status = DecodeTilesThreadedNonFrameParallel( - sequence_header, frame_header, tiles, tile_groups, - block_parameters_holder, frame_scratch_buffer, &post_filter, - &pending_tiles); + status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer, + &post_filter, &pending_tiles); } if (status != kStatusOk) return status; if (frame_header.enable_frame_end_update_cdf) { @@ -928,8 +1003,8 @@ StatusCode DecoderImpl::DecodeTilesNonFrameParallel( } } post_filter->ApplyFilteringForOneSuperBlockRow( - row4x4, block_width4x4, - row4x4 + block_width4x4 >= frame_header.rows4x4); + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/true); } frame_scratch_buffer->tile_scratch_buffer_pool.Release( std::move(tile_scratch_buffer)); @@ -937,11 +1012,7 @@ StatusCode DecoderImpl::DecodeTilesNonFrameParallel( } StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( - const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, const Vector<std::unique_ptr<Tile>>& tiles, - const Vector<ObuTileGroup>& tile_groups, - const BlockParametersHolder& block_parameters_holder, FrameScratchBuffer* const frame_scratch_buffer, PostFilter* const post_filter, BlockingCounterWithStatus* const pending_tiles) { @@ -964,7 +1035,7 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( tile_count) { if (!failed) { const auto& tile_ptr = tiles[index]; - if (!tile_ptr->ParseAndDecode(/*is_main_thread=*/false)) { + if (!tile_ptr->ParseAndDecode()) { LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); failed = true; } @@ -981,7 +1052,7 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( tile_count) { if (!tile_decoding_failed) { const auto& tile_ptr = tiles[index]; - if 
(!tile_ptr->ParseAndDecode(/*is_main_thread=*/true)) { + if (!tile_ptr->ParseAndDecode()) { LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); tile_decoding_failed = true; } @@ -995,15 +1066,8 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( // Wait until all the tiles have been decoded. tile_decoding_failed |= !pending_tiles->Wait(); if (tile_decoding_failed) return kStatusUnknownError; - if (post_filter->DoDeblock() && kDeblockFilterBitMask) { - frame_scratch_buffer->loop_filter_mask.Build( - sequence_header, frame_header, tile_groups.front().start, - tile_groups.back().end, block_parameters_holder, - frame_scratch_buffer->inter_transform_sizes); - } - if (threading_strategy.post_filter_thread_pool() != nullptr) { - post_filter->ApplyFilteringThreaded(); - } + assert(threading_strategy.post_filter_thread_pool() != nullptr); + post_filter->ApplyFilteringThreaded(); return kStatusOk; } @@ -1048,8 +1112,8 @@ StatusCode DecoderImpl::DecodeTilesFrameParallel( } } const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow( - row4x4, block_width4x4, - row4x4 + block_width4x4 >= frame_header.rows4x4); + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/true); if (progress_row >= 0) { current_frame->SetProgress(progress_row); } @@ -1062,6 +1126,309 @@ StatusCode DecoderImpl::DecodeTilesFrameParallel( return kStatusOk; } +StatusCode DecoderImpl::DecodeTilesThreadedFrameParallel( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector<std::unique_ptr<Tile>>& tiles, + const SymbolDecoderContext& saved_symbol_decoder_context, + const SegmentationMap* const prev_segment_ids, + FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, RefCountedBuffer* const current_frame) { + // Parse the frame. + ThreadPool& thread_pool = + *frame_scratch_buffer->threading_strategy.thread_pool(); + std::atomic<int> tile_counter(0); + const int tile_count = static_cast<int>(tiles.size()); + const int num_workers = thread_pool.num_threads(); + BlockingCounterWithStatus parse_workers(num_workers); + // Submit tile parsing jobs to the thread pool. + for (int i = 0; i < num_workers; ++i) { + thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() { + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Parse()) { + LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number()); + failed = true; + } + } + } + parse_workers.Decrement(!failed); + }); + } + + // Have the current thread participate in parsing. + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Parse()) { + LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number()); + failed = true; + } + } + } + + // Wait until all the parse workers are done. This ensures that all the tiles + // have been parsed. 
+ if (!parse_workers.Wait() || failed) { + return kLibgav1StatusUnknownError; + } + if (frame_header.enable_frame_end_update_cdf) { + frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context; + } + current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context); + SetCurrentFrameSegmentationMap(frame_header, prev_segment_ids, current_frame); + current_frame->SetFrameState(kFrameStateParsed); + + // Decode the frame. + const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16; + const int block_width4x4_log2 = + sequence_header.use_128x128_superblock ? 5 : 4; + const int superblock_rows = + (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2; + if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) || + !frame_scratch_buffer->superblock_row_progress_condvar.Resize( + superblock_rows)) { + return kLibgav1StatusOutOfMemory; + } + int* const superblock_row_progress = + frame_scratch_buffer->superblock_row_progress.get(); + memset(superblock_row_progress, 0, + superblock_rows * sizeof(superblock_row_progress[0])); + frame_scratch_buffer->tile_decoding_failed = false; + const int tile_columns = frame_header.tile_info.tile_columns; + const bool decode_entire_tiles_in_worker_threads = + num_workers >= tile_columns; + BlockingCounter pending_jobs( + decode_entire_tiles_in_worker_threads ? num_workers : tile_columns); + if (decode_entire_tiles_in_worker_threads) { + // Submit tile decoding jobs to the thread pool. + tile_counter = 0; + for (int i = 0; i < num_workers; ++i) { + thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs, + frame_scratch_buffer, superblock_rows]() { + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (failed) continue; + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Decode( + &frame_scratch_buffer->superblock_row_mutex, + frame_scratch_buffer->superblock_row_progress.get(), + frame_scratch_buffer->superblock_row_progress_condvar + .get())) { + LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); + failed = true; + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + } + } + pending_jobs.Decrement(); + }); + } + } else { + // Schedule the jobs for first tile row. + for (int tile_index = 0; tile_index < tile_columns; ++tile_index) { + thread_pool.Schedule([this, &tiles, tile_index, block_width4x4, + tile_columns, superblock_rows, frame_scratch_buffer, + post_filter, &pending_jobs]() { + DecodeSuperBlockRowInTile( + tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, &pending_jobs); + pending_jobs.Decrement(); + }); + } + } + + // Current thread will do the post filters. 
+ std::condition_variable* const superblock_row_progress_condvar = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + const std::unique_ptr<Tile>* tile_row_base = &tiles[0]; + for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4; + row4x4 += block_width4x4, ++index) { + if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) { + tile_row_base += tile_columns; + } + { + std::unique_lock<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + while (superblock_row_progress[index] != tile_columns && + !frame_scratch_buffer->tile_decoding_failed) { + superblock_row_progress_condvar[index].wait(lock); + } + if (frame_scratch_buffer->tile_decoding_failed) break; + } + if (post_filter->DoDeblock()) { + // Apply deblocking filter for the tile boundaries of this superblock row. + // The deblocking filter for the internal blocks will be applied in the + // tile worker threads. In this thread, we will only have to apply + // deblocking filter for the tile boundaries. + ApplyDeblockingFilterForTileBoundaries( + post_filter, tile_row_base, frame_header, row4x4, block_width4x4, + tile_columns, decode_entire_tiles_in_worker_threads); + } + // Apply all the post filters other than deblocking. + const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow( + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/false); + if (progress_row >= 0) { + current_frame->SetProgress(progress_row); + } + } + // Wait until all the pending jobs are done. This ensures that all the tiles + // have been decoded and wrapped up. + pending_jobs.Wait(); + { + std::lock_guard<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + if (frame_scratch_buffer->tile_decoding_failed) { + return kLibgav1StatusUnknownError; + } + } + + current_frame->SetFrameState(kFrameStateDecoded); + return kStatusOk; +} + +void DecoderImpl::DecodeSuperBlockRowInTile( + const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4, + const int superblock_size4x4, const int tile_columns, + const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, BlockingCounter* const pending_jobs) { + std::unique_ptr<TileScratchBuffer> scratch_buffer = + frame_scratch_buffer->tile_scratch_buffer_pool.Get(); + if (scratch_buffer == nullptr) { + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + return; + } + Tile& tile = *tiles[tile_index]; + const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>( + row4x4, scratch_buffer.get()); + frame_scratch_buffer->tile_scratch_buffer_pool.Release( + std::move(scratch_buffer)); + if (!ok) { + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + return; + } + if (post_filter->DoDeblock()) { + // Apply vertical deblock filtering for all the columns in this tile except + // for the first 64 columns. + post_filter->ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(), + superblock_size4x4); + // Apply horizontal deblock filtering for all the columns in this tile + // except for the first and the last 64 columns. + // Note about the last tile of each row: For the last tile, column4x4_end + // may not be a multiple of 16. In that case it is still okay to simply + // subtract 16 since ApplyDeblockFilter() will only do the filters in + // increments of 64 columns (or 32 columns for chroma with subsampling). 
+ post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, + tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4); + } + const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4); + const int index = row4x4 >> superblock_size4x4_log2; + int* const superblock_row_progress = + frame_scratch_buffer->superblock_row_progress.get(); + std::condition_variable* const superblock_row_progress_condvar = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + bool notify; + { + std::lock_guard<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + notify = ++superblock_row_progress[index] == tile_columns; + } + if (notify) { + // We are done decoding this superblock row. Notify the post filtering + // thread. + superblock_row_progress_condvar[index].notify_one(); + } + // Schedule the next superblock row (if one exists). + ThreadPool& thread_pool = + *frame_scratch_buffer->threading_strategy.thread_pool(); + const int next_row4x4 = row4x4 + superblock_size4x4; + if (!tile.IsRow4x4Inside(next_row4x4)) { + tile_index += tile_columns; + } + if (tile_index >= tiles.size()) return; + pending_jobs->IncrementBy(1); + thread_pool.Schedule([this, &tiles, tile_index, next_row4x4, + superblock_size4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, pending_jobs]() { + DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4, + superblock_size4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, pending_jobs); + pending_jobs->Decrement(); + }); +} + +void DecoderImpl::ApplyDeblockingFilterForTileBoundaries( + PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base, + const ObuFrameHeader& frame_header, int row4x4, int block_width4x4, + int tile_columns, bool decode_entire_tiles_in_worker_threads) { + // Apply vertical deblock filtering for the first 64 columns of each tile. + for (int tile_column = 0; tile_column < tile_columns; ++tile_column) { + const Tile& tile = *tile_row_base[tile_column]; + post_filter->ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, tile.column4x4_start(), + tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + } + if (decode_entire_tiles_in_worker_threads && + row4x4 == tile_row_base[0]->row4x4_start()) { + // This is the first superblock row of a tile row. In this case, apply + // horizontal deblock filtering for the entire superblock row. + post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0, + frame_header.columns4x4, block_width4x4); + } else { + // Apply horizontal deblock filtering for the first 64 columns of the + // first tile. + const Tile& first_tile = *tile_row_base[0]; + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(), + first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + // Apply horizontal deblock filtering for the last 64 columns of the + // previous tile and the first 64 columns of the current tile. + for (int tile_column = 1; tile_column < tile_columns; ++tile_column) { + const Tile& tile = *tile_row_base[tile_column]; + // If the previous tile has more than 64 columns, then include those + // for the horizontal deblock. + const Tile& previous_tile = *tile_row_base[tile_column - 1]; + const int column4x4_start = + tile.column4x4_start() - + ((tile.column4x4_start() - kNum4x4InLoopFilterUnit != + previous_tile.column4x4_start()) + ? 
kNum4x4InLoopFilterUnit + : 0); + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, column4x4_start, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + } + // Apply horizontal deblock filtering for the last 64 columns of the + // last tile. + const Tile& last_tile = *tile_row_base[tile_columns - 1]; + // Identify the last column4x4 value and do horizontal filtering for + // that column4x4. The value of last column4x4 is the nearest multiple + // of 16 that is before tile.column4x4_end(). + const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15; + // If column4x4_start is the same as tile.column4x4_start() then it + // means that the last tile has <= 64 columns. So there is nothing left + // to deblock (since it was already deblocked in the loop above). + if (column4x4_start != last_tile.column4x4_start()) { + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, column4x4_start, + last_tile.column4x4_end(), block_width4x4); + } + } +} + void DecoderImpl::SetCurrentFrameSegmentationMap( const ObuFrameHeader& frame_header, const SegmentationMap* prev_segment_ids, RefCountedBuffer* const current_frame) { @@ -1092,10 +1459,7 @@ StatusCode DecoderImpl::ApplyFilmGrain( return kStatusOk; } if (!frame_header.show_existing_frame && - frame_header.refresh_frame_flags == 0 && - // TODO(vigneshv): In frame parallel mode, we never do film grain in - // place. Revisit this and see if this constraint need to be enforced. - !IsFrameParallel()) { + frame_header.refresh_frame_flags == 0) { // If show_existing_frame is true, then the current frame is a previously // saved reference frame. If refresh_frame_flags is nonzero, then the // state_.UpdateReferenceFrames() call above has saved the current frame as diff --git a/chromium/third_party/libgav1/src/src/decoder_impl.h b/chromium/third_party/libgav1/src/src/decoder_impl.h index dbc79ed85d7..4d58999c95e 100644 --- a/chromium/third_party/libgav1/src/src/decoder_impl.h +++ b/chromium/third_party/libgav1/src/src/decoder_impl.h @@ -18,7 +18,6 @@ #define LIBGAV1_SRC_DECODER_IMPL_H_ #include <array> -#include <atomic> #include <condition_variable> // NOLINT (unapproved c++11 header) #include <cstddef> #include <cstdint> @@ -32,7 +31,6 @@ #include "src/gav1/decoder_buffer.h" #include "src/gav1/decoder_settings.h" #include "src/gav1/status_code.h" -#include "src/loop_filter_mask.h" #include "src/obu_parser.h" #include "src/residual_buffer_pool.h" #include "src/symbol_decoder_context.h" @@ -129,6 +127,19 @@ class DecoderImpl : public Allocable { private: explicit DecoderImpl(const DecoderSettings* settings); StatusCode Init(); + // Called when the first frame is enqueued. It does the OBU parsing for one + // temporal unit to retrieve the tile configuration and sets up the frame + // threading if frame parallel mode is allowed. It also initializes the + // |temporal_units_| queue based on the number of frame threads. + // + // The following are the limitations of the current implementation: + // * It assumes that all frames in the video have the same tile + // configuration. The frame parallel threading model will not be updated + // based on tile configuration changes mid-stream. + // * The above assumption holds true even when there is a new coded video + // sequence (i.e.) a new sequence header. + StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data, + size_t size); // Used only in frame parallel mode. 
Signals failure and waits until the // worker threads are aborted if |status| is a failure status. If |status| is // equal to kStatusOk or kStatusTryAgain, this function does not do anything. @@ -175,11 +186,7 @@ class DecoderImpl : public Allocable { const Vector<std::unique_ptr<Tile>>& tiles, FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter); StatusCode DecodeTilesThreadedNonFrameParallel( - const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, const Vector<std::unique_ptr<Tile>>& tiles, - const Vector<ObuTileGroup>& tile_groups, - const BlockParametersHolder& block_parameters_holder, FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter, BlockingCounterWithStatus* pending_tiles); StatusCode DecodeTilesFrameParallel( @@ -190,6 +197,36 @@ class DecoderImpl : public Allocable { const SegmentationMap* prev_segment_ids, FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter, RefCountedBuffer* current_frame); + StatusCode DecodeTilesThreadedFrameParallel( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector<std::unique_ptr<Tile>>& tiles, + const SymbolDecoderContext& saved_symbol_decoder_context, + const SegmentationMap* prev_segment_ids, + FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter, + RefCountedBuffer* current_frame); + // Helper function used by DecodeTilesThreadedFrameParallel. Decodes the + // superblock row starting at |row4x4| for tile at index |tile_index| in the + // list of tiles |tiles|. If the decoding is successful, then it does the + // following: + // * Schedule the next superblock row in the current tile column for + // decoding (the next superblock row may be in a different tile than the + // current one). + // * If an entire superblock row of the frame has been decoded, it notifies + // the waiters (if there are any). + void DecodeSuperBlockRowInTile(const Vector<std::unique_ptr<Tile>>& tiles, + size_t tile_index, int row4x4, + int superblock_size4x4, int tile_columns, + int superblock_rows, + FrameScratchBuffer* frame_scratch_buffer, + PostFilter* post_filter, + BlockingCounter* pending_jobs); + // Helper function used by DecodeTilesThreadedFrameParallel. Applies the + // deblocking filter for tile boundaries for the superblock row at |row4x4|. + void ApplyDeblockingFilterForTileBoundaries( + PostFilter* post_filter, const std::unique_ptr<Tile>* tile_row_base, + const ObuFrameHeader& frame_header, int row4x4, int block_width4x4, + int tile_columns, bool decode_entire_tiles_in_worker_threads); // Sets the current frame's segmentation map for two cases. The third case // is handled in Tile::DecodeBlock(). void SetCurrentFrameSegmentationMap(const ObuFrameHeader& frame_header, @@ -206,6 +243,11 @@ class DecoderImpl : public Allocable { bool IsNewSequenceHeader(const ObuParser& obu); bool IsFrameParallel() const { return frame_thread_pool_ != nullptr; } + bool HasFailure() { + std::lock_guard<std::mutex> lock(mutex_); + return failure_status_ != kStatusOk; + } + Queue<TemporalUnit> temporal_units_; DecoderState state_; @@ -228,21 +270,16 @@ class DecoderImpl : public Allocable { // 2) DecodeTiles() // Both of these functions have to respond to the other one failing by // aborting whatever they are doing. This variable is used to accomplish that. - std::atomic<bool> abort_{false}; - // Stores the failure status if |abort_| is true. 
- std::atomic<StatusCode> failure_status_{kStatusOk}; + // If |failure_status_| is not kStatusOk, then the two functions will try to + // abort as early as they can. + StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_); ObuSequenceHeader sequence_header_ = {}; // If true, sequence_header is valid. bool has_sequence_header_ = false; -#if defined(ENABLE_FRAME_PARALLEL) - // TODO(b/142583029): A copy of the DecoderSettings is made to facilitate the - // development of frame parallel mode behind a compile time flag. - DecoderSettings settings_; -#else const DecoderSettings& settings_; -#endif + bool seen_first_frame_ = false; }; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc index 1fccfb47b36..c005f081279 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc @@ -36,16 +36,7 @@ namespace dsp { namespace low_bitdepth { namespace { -// CdefDirection: -// Mirror values and pad to 16 elements. -alignas(16) constexpr uint32_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, - 120, 105, 120, 140, 168, 210, - 280, 420, 840, 0}; - -// Used when calculating odd |cost[x]| values to mask off unwanted elements. -// Holds elements 1 3 5 X 5 3 1 X -alignas(16) constexpr uint32_t kDivisionTableOdd[] = {420, 210, 140, 0, - 140, 210, 420, 0}; +#include "src/dsp/cdef.inc" // Expand |a| to int8x16_t, left shift it by |shift| and sum the low // and high values with |b| and |c| respectively. @@ -159,10 +150,10 @@ uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) { // |cost[0]| and |cost[4]| square the input and sum with the corresponding // element from the other end of the vector: -// |kDivisionTable[]| element: +// |kCdefDivisionTable[]| element: // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * -// kDivisionTable[i + 1]; -// cost[0] += Square(partial[0][7]) * kDivisionTable[8]; +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8]; // Because everything is being summed into a single value the distributive // property allows us to mirror the division table and accumulate once. uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b, @@ -179,7 +170,7 @@ uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b, uint32_t SquareAccumulate(const uint16x8_t a) { uint32x4_t c = Square(vget_low_u16(a)); c = SquareAccumulate(c, vget_high_u16(a)); - c = vmulq_n_u32(c, kDivisionTable[7]); + c = vmulq_n_u32(c, kCdefDivisionTable[7]); return SumVector(c); } @@ -188,7 +179,7 @@ uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask, // Remove elements 0-2. 
uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a))); c = vaddq_u32(c, Square(vget_high_u16(a))); - c = vmulq_n_u32(c, kDivisionTable[7]); + c = vmulq_n_u32(c, kCdefDivisionTable[7]); c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]); c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]); @@ -230,14 +221,14 @@ void CdefDirection_NEON(const void* const source, ptrdiff_t stride, cost[6] = SquareAccumulate(partial_lo[6]); const uint32x4_t division_table[4] = { - vld1q_u32(kDivisionTable), vld1q_u32(kDivisionTable + 4), - vld1q_u32(kDivisionTable + 8), vld1q_u32(kDivisionTable + 12)}; + vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4), + vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)}; cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table); cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table); - const uint32x4_t division_table_odd[2] = {vld1q_u32(kDivisionTableOdd), - vld1q_u32(kDivisionTableOdd + 4)}; + const uint32x4_t division_table_odd[2] = { + vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)}; const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)}; @@ -328,31 +319,34 @@ int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference, return vsubq_s16(veorq_s16(clamp_abs_diff, sign), sign); } -template <int width> +template <int width, bool enable_primary = true, bool enable_secondary = true> void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, const int direction, const int primary_strength, const int secondary_strength, const int damping, uint8_t* dst, const ptrdiff_t dst_stride) { static_assert(width == 8 || width == 4, ""); + static_assert(enable_primary || enable_secondary, ""); const uint16x8_t cdef_large_value_mask = vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue)); const int16x8_t primary_threshold = vdupq_n_s16(primary_strength); const int16x8_t secondary_threshold = vdupq_n_s16(secondary_strength); int16x8_t primary_damping_shift, secondary_damping_shift; + // FloorLog2() requires input to be > 0. - if (primary_strength == 0) { - primary_damping_shift = vdupq_n_s16(0); - } else { + // 8-bit damping range: Y: [3, 6], UV: [2, 5]. + if (enable_primary) { + // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary + // for UV filtering. primary_damping_shift = vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength))); } - - if (secondary_strength == 0) { - secondary_damping_shift = vdupq_n_s16(0); - } else { + if (enable_secondary) { + // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is + // necessary. + assert(damping - FloorLog2(secondary_strength) >= 0); secondary_damping_shift = - vdupq_n_s16(-std::max(0, damping - FloorLog2(secondary_strength))); + vdupq_n_s16(-(damping - FloorLog2(secondary_strength))); } const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0]; @@ -366,105 +360,112 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, } else { pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride)); } + uint16x8_t min = pixel; uint16x8_t max = pixel; - - // Primary |direction|. - uint16x8_t primary_val[4]; - if (width == 8) { - LoadDirection(src, src_stride, primary_val, direction); + int16x8_t sum; + + if (enable_primary) { + // Primary |direction|. 
+ uint16x8_t primary_val[4]; + if (width == 8) { + LoadDirection(src, src_stride, primary_val, direction); + } else { + LoadDirection4(src, src_stride, primary_val, direction); + } + + min = vminq_u16(min, primary_val[0]); + min = vminq_u16(min, primary_val[1]); + min = vminq_u16(min, primary_val[2]); + min = vminq_u16(min, primary_val[3]); + + // Convert kCdefLargeValue to 0 before calculating max. + max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask)); + + sum = Constrain(primary_val[0], pixel, primary_threshold, + primary_damping_shift); + sum = vmulq_n_s16(sum, primary_tap_0); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[1], pixel, primary_threshold, + primary_damping_shift), + primary_tap_0); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[2], pixel, primary_threshold, + primary_damping_shift), + primary_tap_1); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[3], pixel, primary_threshold, + primary_damping_shift), + primary_tap_1); } else { - LoadDirection4(src, src_stride, primary_val, direction); + sum = vdupq_n_s16(0); } - min = vminq_u16(min, primary_val[0]); - min = vminq_u16(min, primary_val[1]); - min = vminq_u16(min, primary_val[2]); - min = vminq_u16(min, primary_val[3]); - - // Convert kCdefLargeValue to 0 before calculating max. - max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask)); - - int16x8_t sum = Constrain(primary_val[0], pixel, primary_threshold, - primary_damping_shift); - sum = vmulq_n_s16(sum, primary_tap_0); - sum = vmlaq_n_s16(sum, - Constrain(primary_val[1], pixel, primary_threshold, - primary_damping_shift), - primary_tap_0); - sum = vmlaq_n_s16(sum, - Constrain(primary_val[2], pixel, primary_threshold, - primary_damping_shift), - primary_tap_1); - sum = vmlaq_n_s16(sum, - Constrain(primary_val[3], pixel, primary_threshold, - primary_damping_shift), - primary_tap_1); - - // Secondary |direction| values (+/- 2). Clamp |direction|. - uint16x8_t secondary_val[8]; - if (width == 8) { - LoadDirection(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); - } else { - LoadDirection4(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection4(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. 
+ uint16x8_t secondary_val[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val, direction + 2); + LoadDirection(src, src_stride, secondary_val + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val, direction + 2); + LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); + } + + min = vminq_u16(min, secondary_val[0]); + min = vminq_u16(min, secondary_val[1]); + min = vminq_u16(min, secondary_val[2]); + min = vminq_u16(min, secondary_val[3]); + min = vminq_u16(min, secondary_val[4]); + min = vminq_u16(min, secondary_val[5]); + min = vminq_u16(min, secondary_val[6]); + min = vminq_u16(min, secondary_val[7]); + + max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask)); + + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[0], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[1], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[2], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[3], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[4], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[5], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[6], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[7], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); } - - min = vminq_u16(min, secondary_val[0]); - min = vminq_u16(min, secondary_val[1]); - min = vminq_u16(min, secondary_val[2]); - min = vminq_u16(min, secondary_val[3]); - min = vminq_u16(min, secondary_val[4]); - min = vminq_u16(min, secondary_val[5]); - min = vminq_u16(min, secondary_val[6]); - min = vminq_u16(min, secondary_val[7]); - - max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask)); - - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[0], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[1], pixel, secondary_threshold, - secondary_damping_shift), - 
kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[2], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[3], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[4], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[5], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[6], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[7], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15); sum = vaddq_s16(sum, vdupq_n_s16(8)); @@ -495,26 +496,48 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, // inside the frame. However it requires the source input to be padded with a // constant large value if at the boundary. The input must be uint16_t. void CdefFilter_NEON(const void* const source, const ptrdiff_t source_stride, - const int rows4x4, const int columns4x4, const int curr_x, - const int curr_y, const int subsampling_x, - const int subsampling_y, const int primary_strength, - const int secondary_strength, const int damping, - const int direction, void* const dest, + const int block_width, const int block_height, + const int primary_strength, const int secondary_strength, + const int damping, const int direction, void* const dest, const ptrdiff_t dest_stride) { - const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x; - const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y; - const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x); - const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y); const auto* src = static_cast<const uint16_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - if (block_width == 8) { - DoCdef<8>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + // TODO(slavarnway): Change dsp->cdef_filter to dsp->cdef_filter[2][2]. This + // would eliminate the strength checks. 
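// [Editor's note, not part of the change] The TODO above proposes replacing
// the strength checks below with a [2][2] function-pointer table indexed by
// (primary enabled, secondary enabled). A hypothetical sketch of that idea;
// the table, its name, and the Dispatch8 wrapper are assumptions made for
// illustration and do not exist in libgav1 as of this commit.
using CdefFilterFn = void (*)(const uint16_t* src, ptrdiff_t src_stride,
                              int height, int direction, int primary_strength,
                              int secondary_strength, int damping, uint8_t* dst,
                              ptrdiff_t dst_stride);

// Entry [p][s] handles primary strength p (0 = zero, 1 = nonzero) and
// secondary strength s. Both strengths zero never reaches CDEF, hence nullptr.
const CdefFilterFn kCdefFilter8[2][2] = {
    {nullptr, DoCdef<8, /*enable_primary=*/false>},
    {DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>,
     DoCdef<8>},
};

void Dispatch8(const uint16_t* src, ptrdiff_t src_stride, int height,
               int direction, int primary_strength, int secondary_strength,
               int damping, uint8_t* dst, ptrdiff_t dst_stride) {
  kCdefFilter8[primary_strength > 0][secondary_strength > 0](
      src, src_stride, height, direction, primary_strength, secondary_strength,
      damping, dst, dst_stride);
}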
+ if (secondary_strength > 0) { + if (primary_strength > 0) { + if (block_width == 8) { + DoCdef<8>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } else { + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } } else { - assert(block_width == 4); - DoCdef<4>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } } } diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc index 34868826dcd..424be020bff 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc @@ -1350,8 +1350,6 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, const int height, void* dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; - constexpr int kernel_offset = (8 - num_taps) / 2; - src += src_stride * kernel_offset; const int16_t* src_y = src; // |dest| is 16-bit in compound mode, Pixel otherwise. uint16_t* dest16_y = static_cast<uint16_t*>(dest); @@ -1425,8 +1423,6 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, const int step_y, const int height, void* dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; - constexpr int kernel_offset = (8 - num_taps) / 2; - src += src_stride * kernel_offset; // A possible improvement is to use arithmetic to decide how many times to // apply filters to same source before checking whether to load new srcs. // However, this will only improve performance with very small step sizes. @@ -1498,15 +1494,14 @@ void ConvolveScale2D_NEON(const void* const reference, const int subpixel_y, const int step_x, const int step_y, const int width, const int height, void* prediction, const ptrdiff_t pred_stride) { - // TODO(petersonab): Reduce the height here by using the vertical filter - // size and offset horizontal filter. Reduce intermediate block stride to - // width to make smaller blocks faster. 
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + assert(step_x <= 2048); + const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); const int intermediate_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> kScaleSubPixelBits) + - kSubPixelTaps; - // TODO(b/133525024): Decide whether it's worth branching to a special case - // when step_x or step_y is 1024. + num_vert_taps; assert(step_x <= 2048); // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. @@ -1520,11 +1515,27 @@ void ConvolveScale2D_NEON(const void* const reference, // Similarly for height. int filter_index = GetFilterIndex(horizontal_filter_index, width); int16_t* intermediate = intermediate_result; - const auto* src = static_cast<const uint8_t*>(reference); const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference); + const int vert_kernel_offset = (8 - num_vert_taps) / 2; + src += vert_kernel_offset * src_stride; + + // Derive the maximum value of |step_x| at which all source values fit in one + // 16-byte load. Final index is src_x + |num_taps| - 1 < 16 + // step_x*7 is the final base subpel index for the shuffle mask for filter + // inputs in each iteration on large blocks. When step_x is large, we need a + // larger structure and use a larger table lookup in order to gather all + // filter inputs. + // |num_taps| - 1 is the shuffle index of the final filter input. + const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int kernel_start_ceiling = 16 - num_horiz_taps; + // This truncated quotient |grade_x_threshold| selects |step_x| such that: + // (step_x * 7) >> kScaleSubPixelBits < single load limit + const int grade_x_threshold = + (kernel_start_ceiling << kScaleSubPixelBits) / 7; switch (filter_index) { case 0: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontalSigned6Tap<2>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); @@ -1535,7 +1546,7 @@ void ConvolveScale2D_NEON(const void* const reference, } break; case 1: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); @@ -1547,7 +1558,7 @@ void ConvolveScale2D_NEON(const void* const reference, } break; case 2: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontalSigned8Tap<2>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); @@ -1558,7 +1569,7 @@ void ConvolveScale2D_NEON(const void* const reference, } break; case 3: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc index f63fabdd7e2..e89ba36773b 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc @@ -16,7 +16,6 @@ #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON - #include <arm_neon.h> #include <cassert> @@ -33,6 +32,11 @@ namespace dsp { namespace low_bitdepth { namespace { +template <int bytes> +inline uint16x4_t VshrU128(const uint16x8_t a) { 
+ return vext_u16(vget_low_u16(a), vget_high_u16(a), bytes / 2); +} + // Wiener // Must make a local copy of coefficients to help compiler know that they have @@ -50,7 +54,6 @@ inline void PopulateWienerCoefficients( assert(direction == WienerInfo::kVertical); filter_3 = 128; } - for (int i = 0; i < 3; ++i) { const int16_t coeff = restoration_info.wiener_info.filter[direction][i]; filter[i] = coeff; @@ -76,74 +79,24 @@ inline int CountZeroCoefficients(const int16_t filter[2][kSubPixelTaps]) { return number_zero_coefficients; } -inline void LoadHorizontal4Tap3(const uint8_t* source, uint8x8_t s[3]) { - s[0] = vld1_u8(source); - // Faster than using vshr_n_u64(). - s[1] = vext_u8(s[0], s[0], 1); - s[2] = vext_u8(s[0], s[0], 2); -} - -inline void LoadHorizontal4Tap5(const uint8_t* source, uint8x8_t s[5]) { - s[0] = vld1_u8(source); - // Faster than using vshr_n_u64(). - s[1] = vext_u8(s[0], s[0], 1); - s[2] = vext_u8(s[0], s[0], 2); - s[3] = vext_u8(s[0], s[0], 3); - s[4] = vext_u8(s[0], s[0], 4); -} - -inline void LoadHorizontal8Tap3(const uint8_t* source, uint8x8_t s[3]) { - const uint8x16_t r = vld1q_u8(source); - s[0] = vget_low_u8(r); - s[1] = vext_u8(s[0], vget_high_u8(r), 1); - s[2] = vext_u8(s[0], vget_high_u8(r), 2); -} - -inline void LoadHorizontal8Tap5(const uint8_t* source, uint8x8_t s[5]) { - const uint8x16_t r = vld1q_u8(source); - s[0] = vget_low_u8(r); - s[1] = vext_u8(s[0], vget_high_u8(r), 1); - s[2] = vext_u8(s[0], vget_high_u8(r), 2); - s[3] = vext_u8(s[0], vget_high_u8(r), 3); - s[4] = vext_u8(s[0], vget_high_u8(r), 4); -} - -inline void LoadHorizontalTap7(const uint8_t* source, uint8x8_t s[7]) { - // This is just as fast as an 8x8 transpose but avoids over-reading - // extra rows. It always over-reads by at least 1 value. On small widths - // (4xH) it over-reads by 9 values. - const uint8x16_t r = vld1q_u8(source); - s[0] = vget_low_u8(r); - s[1] = vext_u8(s[0], vget_high_u8(r), 1); - s[2] = vext_u8(s[0], vget_high_u8(r), 2); - s[3] = vext_u8(s[0], vget_high_u8(r), 3); - s[4] = vext_u8(s[0], vget_high_u8(r), 4); - s[5] = vext_u8(s[0], vget_high_u8(r), 5); - s[6] = vext_u8(s[0], vget_high_u8(r), 6); -} - inline int16x8_t HorizontalSum(const uint8x8_t a[3], const int16_t filter[2], int16x8_t sum) { const int16x8_t a_0_2 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[2])); sum = vmlaq_n_s16(sum, a_0_2, filter[0]); sum = vmlaq_n_s16(sum, vreinterpretq_s16_u16(vmovl_u8(a[1])), filter[1]); - sum = vrshrq_n_s16(sum, kInterRoundBitsHorizontal); - // Delaying |horizontal_rounding| until after down shifting allows the sum to // stay in 16 bits. // |horizontal_rounding| = 1 << (bitdepth + kWienerFilterBits - 1) // 1 << ( 8 + 7 - 1) // Plus |kInterRoundBitsHorizontal| and it works out to 1 << 11. sum = vaddq_s16(sum, vdupq_n_s16(1 << 11)); - // Just like |horizontal_rounding|, adding |filter[3]| at this point allows // the sum to stay in 16 bits. // But wait! We *did* calculate |filter[3]| and used it in the sum! But it was // offset by 128. 
Fix that here: // |src[3]| * 128 >> 3 == |src[3]| << 4 sum = vaddq_s16(sum, vreinterpretq_s16_u16(vshll_n_u8(a[1], 4))); - // Saturate to // [0, // (1 << (bitdepth + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1)] @@ -153,111 +106,6 @@ inline int16x8_t HorizontalSum(const uint8x8_t a[3], const int16_t filter[2], return sum; } -inline int16x8_t HorizontalSumTap3(const uint8x8_t a[3], - const int16_t filter[2]) { - return HorizontalSum(a, filter, vdupq_n_s16(0)); -} - -inline int16x8_t HorizontalSumTap5(const uint8x8_t a[5], - const int16_t filter[3]) { - const int16x8_t a_0_4 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[4])); - const int16x8_t sum = vmulq_n_s16(a_0_4, filter[0]); - return HorizontalSum(a + 1, filter + 1, sum); -} - -inline int16x8_t HorizontalSumTap7(const uint8x8_t a[7], - const int16_t filter[4]) { - const int16x8_t a_0_6 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[6])); - const int16x8_t a_1_5 = vreinterpretq_s16_u16(vaddl_u8(a[1], a[5])); - int16x8_t sum = vmulq_n_s16(a_0_6, filter[0]); - sum = vmlaq_n_s16(sum, a_1_5, filter[1]); - return HorizontalSum(a + 2, filter + 2, sum); -} - -inline int16x8_t WienerHorizontal4Tap3(const uint8_t* source, - const int16_t filter[2]) { - uint8x8_t s[5]; - LoadHorizontal4Tap3(source, s); - return HorizontalSumTap3(s, filter); -} - -inline int16x8_t WienerHorizontal4Tap5(const uint8_t* source, - const int16_t filter[3]) { - uint8x8_t s[5]; - LoadHorizontal4Tap5(source, s); - return HorizontalSumTap5(s, filter); -} - -inline int16x8_t WienerHorizontal4Tap7(const uint8_t* source, - const int16_t filter[4]) { - uint8x8_t s[7]; - LoadHorizontalTap7(source, s); - return HorizontalSumTap7(s, filter); -} - -inline int16x8_t WienerHorizontal4x2Tap3(const uint8_t* source, - const ptrdiff_t stride, - const int16_t filter[2]) { - uint8x8_t s0[5], s1[5], s[5]; - LoadHorizontal4Tap3(source + 0 * stride, s0); - LoadHorizontal4Tap3(source + 1 * stride, s1); - s[0] = InterleaveLow32(s0[0], s1[0]); - s[1] = InterleaveLow32(s0[1], s1[1]); - s[2] = InterleaveLow32(s0[2], s1[2]); - return HorizontalSumTap3(s, filter); -} - -inline int16x8_t WienerHorizontal4x2Tap5(const uint8_t* source, - const ptrdiff_t stride, - const int16_t filter[3]) { - uint8x8_t s0[5], s1[5], s[5]; - LoadHorizontal4Tap5(source + 0 * stride, s0); - LoadHorizontal4Tap5(source + 1 * stride, s1); - s[0] = InterleaveLow32(s0[0], s1[0]); - s[1] = InterleaveLow32(s0[1], s1[1]); - s[2] = InterleaveLow32(s0[2], s1[2]); - s[3] = InterleaveLow32(s0[3], s1[3]); - s[4] = InterleaveLow32(s0[4], s1[4]); - return HorizontalSumTap5(s, filter); -} - -inline int16x8_t WienerHorizontal4x2Tap7(const uint8_t* source, - const ptrdiff_t stride, - const int16_t filter[4]) { - uint8x8_t s0[7], s1[7], s[7]; - LoadHorizontalTap7(source + 0 * stride, s0); - LoadHorizontalTap7(source + 1 * stride, s1); - s[0] = InterleaveLow32(s0[0], s1[0]); - s[1] = InterleaveLow32(s0[1], s1[1]); - s[2] = InterleaveLow32(s0[2], s1[2]); - s[3] = InterleaveLow32(s0[3], s1[3]); - s[4] = InterleaveLow32(s0[4], s1[4]); - s[5] = InterleaveLow32(s0[5], s1[5]); - s[6] = InterleaveLow32(s0[6], s1[6]); - return HorizontalSumTap7(s, filter); -} - -inline int16x8_t WienerHorizontal8Tap3(const uint8_t* source, - const int16_t filter[2]) { - uint8x8_t s[3]; - LoadHorizontal8Tap3(source, s); - return HorizontalSumTap3(s, filter); -} - -inline int16x8_t WienerHorizontal8Tap5(const uint8_t* source, - const int16_t filter[3]) { - uint8x8_t s[5]; - LoadHorizontal8Tap5(source, s); - return HorizontalSumTap5(s, filter); -} - -inline int16x8_t 
WienerHorizontal8Tap7(const uint8_t* source, - const int16_t filter[4]) { - uint8x8_t s[7]; - LoadHorizontalTap7(source, s); - return HorizontalSumTap7(s, filter); -} - inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2], int32x4_t sum[2]) { // -(1 << (bitdepth + kInterRoundBitsVertical - 1)) @@ -265,7 +113,6 @@ inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2], constexpr int vertical_rounding = -(1 << 18); const int32x4_t rounding = vdupq_n_s32(vertical_rounding); const int16x8_t a_0_2 = vaddq_s16(a[0], a[2]); - sum[0] = vaddq_s32(sum[0], rounding); sum[1] = vaddq_s32(sum[1], rounding); sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_2), filter[0]); @@ -274,44 +121,9 @@ inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2], sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a[1]), filter[1]); const uint16x4_t sum_lo_16 = vqrshrun_n_s32(sum[0], 11); const uint16x4_t sum_hi_16 = vqrshrun_n_s32(sum[1], 11); - return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16)); } -inline uint8x8_t WienerVerticalTap3(const int16x8_t a[3], - const int16_t filter[2]) { - int32x4_t sum[2]; - sum[0] = sum[1] = vdupq_n_s32(0); - return WienerVertical(a, filter, sum); -} - -inline uint8x8_t WienerVerticalTap5(const int16x8_t a[5], - const int16_t filter[3]) { - const int16x8_t a_0_4 = vaddq_s16(a[0], a[4]); - int32x4_t sum[2]; - - sum[0] = sum[1] = vdupq_n_s32(0); - sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_4), filter[0]); - sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_4), filter[0]); - - return WienerVertical(a + 1, filter + 1, sum); -} - -inline uint8x8_t WienerVerticalTap7(const int16x8_t a[7], - const int16_t filter[4]) { - const int16x8_t a_0_6 = vaddq_s16(a[0], a[6]); - const int16x8_t a_1_5 = vaddq_s16(a[1], a[5]); - int32x4_t sum[2]; - - sum[0] = sum[1] = vdupq_n_s32(0); - sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_6), filter[0]); - sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_6), filter[0]); - sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_1_5), filter[1]); - sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_1_5), filter[1]); - - return WienerVertical(a + 2, filter + 2, sum); -} - // For width 16 and up, store the horizontal results, and then do the vertical // filter row by row. This is faster than doing it column by column when // considering cache issues. 
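// [Editor's sketch, not part of the change] The comment above is the rationale
// for the restructured WienerFilter_NEON in the following hunk: run the
// horizontal taps over the whole block first, store the int16 intermediate
// rows, then sweep that buffer top to bottom for the vertical taps so the
// loads stay sequential. A scalar outline of that two-pass layout; libgav1's
// rounding, offsets and saturation are omitted, and the flat |h_filter| /
// |v_filter| arrays are illustrative.
#include <cstddef>
#include <cstdint>

void SeparableFilter(const uint8_t* src, ptrdiff_t src_stride, int width,
                     int height, int taps, const int16_t* h_filter,
                     const int16_t* v_filter, int16_t* intermediate,
                     uint8_t* dst, ptrdiff_t dst_stride) {
  // Pass 1: horizontal taps for every row the vertical pass will need.
  const int intermediate_height = height + taps - 1;
  for (int y = 0; y < intermediate_height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < taps; ++k) sum += h_filter[k] * src[x + k];
      intermediate[y * width + x] = static_cast<int16_t>(sum);
    }
    src += src_stride;
  }
  // Pass 2: vertical taps, consuming |taps| consecutive intermediate rows per
  // output row; the rows are contiguous, so the pass streams through memory.
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < taps; ++k)
        sum += v_filter[k] * intermediate[(y + k) * width + x];
      dst[x] = static_cast<uint8_t>(sum);  // real code rounds and clamps here
    }
    dst += dst_stride;
  }
}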
@@ -330,360 +142,168 @@ void WienerFilter_NEON(const void* const source, void* const dest, int16_t* wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); int16_t filter_horizontal[kSubPixelTaps / 2]; int16_t filter_vertical[kSubPixelTaps / 2]; - int16x8_t a[7]; - PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter_horizontal); PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter_vertical); - if (number_zero_coefficients == 0) { // 7-tap - src -= kCenterTap * source_stride + kCenterTap; - - if (width > 8) { - int y = height + kSubPixelTaps - 2; - do { - int x = 0; - do { - const int16x8_t a = WienerHorizontal8Tap7(src + x, filter_horizontal); - vst1q_s16(wiener_buffer + x, a); - x += 8; - } while (x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); - - wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); - - y = height; + src -= (kCenterTap - 1) * source_stride + kCenterTap; + int y = height + kSubPixelTaps - 4; + do { + wiener_buffer += width; + int x = 0; do { - int x = 0; - do { - a[0] = vld1q_s16(wiener_buffer + x + 0 * width); - a[1] = vld1q_s16(wiener_buffer + x + 1 * width); - a[2] = vld1q_s16(wiener_buffer + x + 2 * width); - a[3] = vld1q_s16(wiener_buffer + x + 3 * width); - a[4] = vld1q_s16(wiener_buffer + x + 4 * width); - a[5] = vld1q_s16(wiener_buffer + x + 5 * width); - a[6] = vld1q_s16(wiener_buffer + x + 6 * width); - - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - vst1_u8(dst + x, r); - x += 8; - } while (x < width); - wiener_buffer += width; - dst += dest_stride; - } while (--y != 0); - } else if (width > 4) { - a[0] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[1] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[2] = WienerHorizontal8Tap7(src, filter_horizontal); + // This is just as fast as an 8x8 transpose but avoids over-reading + // extra rows. It always over-reads by at least 1 value. On small widths + // (4xH) it over-reads by 9 values. + const uint8x16_t r = vld1q_u8(src + x); + uint8x8_t s[7]; + s[0] = vget_low_u8(r); + s[1] = vext_u8(s[0], vget_high_u8(r), 1); + s[2] = vext_u8(s[0], vget_high_u8(r), 2); + s[3] = vext_u8(s[0], vget_high_u8(r), 3); + s[4] = vext_u8(s[0], vget_high_u8(r), 4); + s[5] = vext_u8(s[0], vget_high_u8(r), 5); + s[6] = vext_u8(s[0], vget_high_u8(r), 6); + const int16x8_t s_0_6 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[6])); + const int16x8_t s_1_5 = vreinterpretq_s16_u16(vaddl_u8(s[1], s[5])); + int16x8_t sum = vmulq_n_s16(s_0_6, filter_horizontal[0]); + sum = vmlaq_n_s16(sum, s_1_5, filter_horizontal[1]); + const int16x8_t a = HorizontalSum(s + 2, filter_horizontal + 2, sum); + vst1q_s16(wiener_buffer + x, a); + x += 8; + } while (x < width); src += source_stride; - a[3] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[4] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[5] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - - int y = height; + } while (--y != 0); + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. 
+ memcpy(wiener_buffer + width, wiener_buffer, + sizeof(*wiener_buffer) * width); + wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); + memcpy(wiener_buffer, wiener_buffer + width, + sizeof(*wiener_buffer) * width); + + y = height; + do { + int x = 0; do { - a[6] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - vst1_u8(dst, r); - dst += dest_stride; - - a[0] = a[1]; - a[1] = a[2]; - a[2] = a[3]; - a[3] = a[4]; - a[4] = a[5]; - a[5] = a[6]; - } while (--y != 0); - } else { - int y = height; - - if ((y & 1) != 0) { - --y; - a[0] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += source_stride; - a[2] = WienerHorizontal4x2Tap7(src + source_stride, source_stride, - filter_horizontal); - a[4] = WienerHorizontal4x2Tap7(src + 3 * source_stride, source_stride, - filter_horizontal); - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - a[6] = - WienerHorizontal4Tap7(src + 5 * source_stride, filter_horizontal); - a[5] = vcombine_s16(vget_high_s16(a[4]), vget_low_s16(a[6])); - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - StoreLo4(dst, r); - dst += dest_stride; - } - - if (y != 0) { - a[0] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[2] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[4] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - - do { - a[6] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[5] = vcombine_s16(vget_high_s16(a[4]), vget_low_s16(a[6])); - - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - StoreLo4(dst, r); - dst += dest_stride; - StoreHi4(dst, r); - dst += dest_stride; - - a[0] = a[2]; - a[1] = a[3]; - a[2] = a[4]; - a[3] = a[5]; - a[4] = a[6]; - y -= 2; - } while (y != 0); - } - } + int16x8_t a[7]; + a[0] = vld1q_s16(wiener_buffer + x + 0 * width); + a[1] = vld1q_s16(wiener_buffer + x + 1 * width); + a[2] = vld1q_s16(wiener_buffer + x + 2 * width); + a[3] = vld1q_s16(wiener_buffer + x + 3 * width); + a[4] = vld1q_s16(wiener_buffer + x + 4 * width); + a[5] = vld1q_s16(wiener_buffer + x + 5 * width); + a[6] = vld1q_s16(wiener_buffer + x + 6 * width); + const int16x8_t a_0_6 = vaddq_s16(a[0], a[6]); + const int16x8_t a_1_5 = vaddq_s16(a[1], a[5]); + int32x4_t sum[2]; + sum[0] = sum[1] = vdupq_n_s32(0); + sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_6), filter_vertical[0]); + sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_6), filter_vertical[0]); + sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_1_5), filter_vertical[1]); + sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_1_5), filter_vertical[1]); + const uint8x8_t r = WienerVertical(a + 2, filter_vertical + 2, sum); + vst1_u8(dst + x, r); + x += 8; + } while (x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } else if (number_zero_coefficients == 1) { // 5-tap src -= (kCenterTap - 1) * source_stride + kCenterTap - 1; - - if (width > 8) { - int y = height + kSubPixelTaps - 4; - do { - int x = 0; - do { - const int16x8_t a = - WienerHorizontal8Tap5(src + x, filter_horizontal + 1); - vst1q_s16(wiener_buffer + x, a); - x += 8; - } while (x < width); - src += 
source_stride; - wiener_buffer += width; - } while (--y != 0); - - wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); - - y = height; + int y = height + kSubPixelTaps - 4; + do { + int x = 0; do { - int x = 0; - do { - a[0] = vld1q_s16(wiener_buffer + x + 0 * width); - a[1] = vld1q_s16(wiener_buffer + x + 1 * width); - a[2] = vld1q_s16(wiener_buffer + x + 2 * width); - a[3] = vld1q_s16(wiener_buffer + x + 3 * width); - a[4] = vld1q_s16(wiener_buffer + x + 4 * width); - - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - vst1_u8(dst + x, r); - x += 8; - } while (x < width); - wiener_buffer += width; - dst += dest_stride; - } while (--y != 0); - } else if (width > 4) { - a[0] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - a[1] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - a[2] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - a[3] = WienerHorizontal8Tap5(src, filter_horizontal + 1); + const uint8x16_t r = vld1q_u8(src + x); + uint8x8_t s[5]; + s[0] = vget_low_u8(r); + s[1] = vext_u8(s[0], vget_high_u8(r), 1); + s[2] = vext_u8(s[0], vget_high_u8(r), 2); + s[3] = vext_u8(s[0], vget_high_u8(r), 3); + s[4] = vext_u8(s[0], vget_high_u8(r), 4); + const int16x8_t s_0_4 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[4])); + const int16x8_t sum = vmulq_n_s16(s_0_4, filter_horizontal[1]); + const int16x8_t a = HorizontalSum(s + 1, filter_horizontal + 2, sum); + vst1q_s16(wiener_buffer + x, a); + x += 8; + } while (x < width); src += source_stride; + wiener_buffer += width; + } while (--y != 0); - int y = height; + wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); + y = height; + do { + int x = 0; do { - a[4] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - vst1_u8(dst, r); - dst += dest_stride; - - a[0] = a[1]; - a[1] = a[2]; - a[2] = a[3]; - a[3] = a[4]; - } while (--y != 0); - } else { - int y = height; - - if ((y & 1) != 0) { - --y; - a[0] = - WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1); - src += source_stride; - a[2] = WienerHorizontal4x2Tap5(src + source_stride, source_stride, - filter_horizontal + 1); - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - a[4] = WienerHorizontal4Tap5(src + 3 * source_stride, - filter_horizontal + 1); - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - StoreLo4(dst, r); - dst += dest_stride; - } - - if (y != 0) { - a[0] = - WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1); - src += 2 * source_stride; - a[2] = - WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1); - src += 2 * source_stride; - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - - do { - a[4] = WienerHorizontal4x2Tap5(src, source_stride, - filter_horizontal + 1); - src += 2 * source_stride; - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - StoreLo4(dst, r); - dst += dest_stride; - StoreHi4(dst, r); - dst += dest_stride; - - a[0] = a[2]; - a[1] = a[3]; - a[2] = a[4]; - y -= 2; - } while (y != 0); - } - } + int16x8_t a[5]; + a[0] = vld1q_s16(wiener_buffer + x + 0 * width); + a[1] = vld1q_s16(wiener_buffer + x + 1 * width); + a[2] = vld1q_s16(wiener_buffer + x + 2 * width); + a[3] = vld1q_s16(wiener_buffer + x + 3 * width); + a[4] = 
vld1q_s16(wiener_buffer + x + 4 * width); + const int16x8_t a_0_4 = vaddq_s16(a[0], a[4]); + int32x4_t sum[2]; + sum[0] = sum[1] = vdupq_n_s32(0); + sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_4), filter_vertical[1]); + sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_4), filter_vertical[1]); + const uint8x8_t r = WienerVertical(a + 1, filter_vertical + 2, sum); + vst1_u8(dst + x, r); + x += 8; + } while (x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } else { // 3-tap src -= (kCenterTap - 2) * source_stride + kCenterTap - 2; - - if (width > 8) { - int y = height + kSubPixelTaps - 6; - do { - int x = 0; - do { - const int16x8_t a = - WienerHorizontal8Tap3(src + x, filter_horizontal + 2); - vst1q_s16(wiener_buffer + x, a); - x += 8; - } while (x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); - - wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); - - y = height; + int y = height + kSubPixelTaps - 6; + do { + int x = 0; do { - int x = 0; - do { - a[0] = vld1q_s16(wiener_buffer + x + 0 * width); - a[1] = vld1q_s16(wiener_buffer + x + 1 * width); - a[2] = vld1q_s16(wiener_buffer + x + 2 * width); - - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - vst1_u8(dst + x, r); - x += 8; - } while (x < width); - wiener_buffer += width; - dst += dest_stride; - } while (--y != 0); - } else if (width > 4) { - a[0] = WienerHorizontal8Tap3(src, filter_horizontal + 2); - src += source_stride; - a[1] = WienerHorizontal8Tap3(src, filter_horizontal + 2); + const uint8x16_t r = vld1q_u8(src + x); + uint8x8_t s[3]; + s[0] = vget_low_u8(r); + s[1] = vext_u8(s[0], vget_high_u8(r), 1); + s[2] = vext_u8(s[0], vget_high_u8(r), 2); + const int16x8_t a = + HorizontalSum(s, filter_horizontal + 2, vdupq_n_s16(0)); + vst1q_s16(wiener_buffer + x, a); + x += 8; + } while (x < width); src += source_stride; + wiener_buffer += width; + } while (--y != 0); - int y = height; + wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); + y = height; + do { + int x = 0; do { - a[2] = WienerHorizontal8Tap3(src, filter_horizontal + 2); - src += source_stride; - - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - vst1_u8(dst, r); - dst += dest_stride; - - a[0] = a[1]; - a[1] = a[2]; - } while (--y != 0); - } else { - int y = height; - - if ((y & 1) != 0) { - --y; - a[0] = - WienerHorizontal4x2Tap3(src, source_stride, filter_horizontal + 2); - src += source_stride; - a[2] = - WienerHorizontal4Tap3(src + source_stride, filter_horizontal + 2); - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - StoreLo4(dst, r); - dst += dest_stride; - } - - if (y != 0) { - a[0] = - WienerHorizontal4x2Tap3(src, source_stride, filter_horizontal + 2); - src += 2 * source_stride; - - do { - a[2] = WienerHorizontal4x2Tap3(src, source_stride, - filter_horizontal + 2); - src += 2 * source_stride; - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - StoreLo4(dst, r); - dst += dest_stride; - StoreHi4(dst, r); - dst += dest_stride; - - a[0] = a[2]; - y -= 2; - } while (y != 0); - } - } + int16x8_t a[3]; + a[0] = vld1q_s16(wiener_buffer + x + 0 * width); + a[1] = vld1q_s16(wiener_buffer + x + 1 * width); + a[2] = vld1q_s16(wiener_buffer + x + 2 * width); + int32x4_t sum[2]; + sum[0] = sum[1] = vdupq_n_s32(0); + const uint8x8_t r = WienerVertical(a, filter_vertical + 2, sum); + 
vst1_u8(dst + x, r); + x += 8; + } while (x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } } +//------------------------------------------------------------------------------ // SGR -constexpr int kSgrProjScaleBits = 20; -constexpr int kSgrProjRestoreBits = 4; -constexpr int kSgrProjSgrBits = 8; -constexpr int kSgrProjReciprocalBits = 12; - -// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); -// sgr_ma2 = 256 - a2 -constexpr uint8_t kSgrMa2Lookup[256] = { - 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14, - 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, - 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0}; - template <int n> inline uint16x4_t CalculateSgrMA2(const uint32x4_t sum_sq, const uint16x4_t sum, const uint32_t s) { @@ -697,15 +317,15 @@ inline uint16x4_t CalculateSgrMA2(const uint32x4_t sum_sq, const uint16x4_t sum, // z = RightShiftWithRounding(p * s, kSgrProjScaleBits); const uint32x4_t pxs = vmulq_n_u32(p, s); - // For some reason vrshrn_n_u32() (narrowing shift) can only shift by 16 - // and kSgrProjScaleBits is 20. + // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits + // is 20. 
const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits); return vmovn_u32(shifted); } -inline uint16x4_t CalculateB2Shifted(const uint8x8_t sgr_ma2, - const uint16x4_t sum, - const uint32_t one_over_n) { +inline uint16x4_t CalculateIntermediate4(const uint8x8_t sgr_ma2, + const uint16x4_t sum, + const uint32_t one_over_n) { // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n // 1 << kSgrProjSgrBits = 256 // |a2| = [1, 256] @@ -726,9 +346,9 @@ inline uint16x4_t CalculateB2Shifted(const uint8x8_t sgr_ma2, return vrshrn_n_u32(b2, kSgrProjReciprocalBits); } -inline uint16x8_t CalculateB2Shifted(const uint8x8_t sgr_ma2, - const uint16x8_t sum, - const uint32_t one_over_n) { +inline uint16x8_t CalculateIntermediate8(const uint8x8_t sgr_ma2, + const uint16x8_t sum, + const uint32_t one_over_n) { // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n // 1 << kSgrProjSgrBits = 256 // |a2| = [1, 256] @@ -753,41 +373,41 @@ inline uint16x8_t CalculateB2Shifted(const uint8x8_t sgr_ma2, return vcombine_u16(b2_lo, b2_hi); } -inline uint16x8_t Sum3(const uint16x8_t left, const uint16x8_t middle, - const uint16x8_t right) { +inline uint16x4_t Sum3(const uint16x4_t left, const uint16x4_t middle, + const uint16x4_t right) { + const uint16x4_t sum = vadd_u16(left, middle); + return vadd_u16(sum, right); +} + +inline uint16x8_t Sum3_16(const uint16x8_t left, const uint16x8_t middle, + const uint16x8_t right) { const uint16x8_t sum = vaddq_u16(left, middle); return vaddq_u16(sum, right); } -inline uint32x4_t Sum3(const uint32x4_t left, const uint32x4_t middle, - const uint32x4_t right) { +inline uint32x4_t Sum3_32(const uint32x4_t left, const uint32x4_t middle, + const uint32x4_t right) { const uint32x4_t sum = vaddq_u32(left, middle); return vaddq_u32(sum, right); } -inline uint16x8_t Sum3W(const uint8x8_t left, const uint8x8_t middle, - const uint8x8_t right) { +inline uint16x8_t Sum3W_16(const uint8x8_t left, const uint8x8_t middle, + const uint8x8_t right) { const uint16x8_t sum = vaddl_u8(left, middle); return vaddw_u8(sum, right); } -inline uint32x4_t Sum3W(const uint16x4_t left, const uint16x4_t middle, - const uint16x4_t right) { - const uint32x4_t sum = vaddl_u16(left, middle); - return vaddw_u16(sum, right); -} - -inline uint16x4_t Sum3(const uint16x4_t left, const uint16x4_t middle, - const uint16x4_t right) { - const uint16x4_t sum = vadd_u16(left, middle); - return vadd_u16(sum, right); +inline uint16x8_t Sum3W_16(const uint8x8_t a[3]) { + return Sum3W_16(a[0], a[1], a[2]); } -inline uint16x8_t Sum3W(const uint8x8_t a[3]) { - return Sum3W(a[0], a[1], a[2]); +inline uint32x4_t Sum3W_32(const uint16x4_t left, const uint16x4_t middle, + const uint16x4_t right) { + const uint32x4_t sum = vaddl_u16(left, middle); + return vaddw_u16(sum, right); } -inline uint16x8x2_t Sum3W(const uint8x16_t a[3]) { +inline uint16x8x2_t Sum3W_16x2(const uint8x16_t a[3]) { const uint8x8_t low0 = vget_low_u8(a[0]); const uint8x8_t low1 = vget_low_u8(a[1]); const uint8x8_t low2 = vget_low_u8(a[2]); @@ -795,8 +415,8 @@ inline uint16x8x2_t Sum3W(const uint8x16_t a[3]) { const uint8x8_t high1 = vget_high_u8(a[1]); const uint8x8_t high2 = vget_high_u8(a[2]); uint16x8x2_t sum; - sum.val[0] = Sum3W(low0, low1, low2); - sum.val[1] = Sum3W(high0, high1, high2); + sum.val[0] = Sum3W_16(low0, low1, low2); + sum.val[1] = Sum3W_16(high0, high1, high2); return sum; } @@ -808,32 +428,31 @@ inline uint32x4x2_t Sum3W(const uint16x8_t a[3]) { const uint16x4_t high1 = vget_high_u16(a[1]); const uint16x4_t high2 = 
vget_high_u16(a[2]); uint32x4x2_t sum; - sum.val[0] = Sum3W(low0, low1, low2); - sum.val[1] = Sum3W(high0, high1, high2); + sum.val[0] = Sum3W_32(low0, low1, low2); + sum.val[1] = Sum3W_32(high0, high1, high2); return sum; } template <int index> -inline uint32x4_t Sum3WLow(const uint16x8x2_t a[3]) { +inline uint32x4_t Sum3WLo(const uint16x8x2_t a[3]) { const uint16x4_t low0 = vget_low_u16(a[0].val[index]); const uint16x4_t low1 = vget_low_u16(a[1].val[index]); const uint16x4_t low2 = vget_low_u16(a[2].val[index]); - return Sum3W(low0, low1, low2); + return Sum3W_32(low0, low1, low2); } -template <int index> -inline uint32x4_t Sum3WHigh(const uint16x8x2_t a[3]) { - const uint16x4_t high0 = vget_high_u16(a[0].val[index]); - const uint16x4_t high1 = vget_high_u16(a[1].val[index]); - const uint16x4_t high2 = vget_high_u16(a[2].val[index]); - return Sum3W(high0, high1, high2); +inline uint32x4_t Sum3WHi(const uint16x8x2_t a[3]) { + const uint16x4_t high0 = vget_high_u16(a[0].val[0]); + const uint16x4_t high1 = vget_high_u16(a[1].val[0]); + const uint16x4_t high2 = vget_high_u16(a[2].val[0]); + return Sum3W_32(high0, high1, high2); } inline uint32x4x3_t Sum3W(const uint16x8x2_t a[3]) { uint32x4x3_t sum; - sum.val[0] = Sum3WLow<0>(a); - sum.val[1] = Sum3WHigh<0>(a); - sum.val[2] = Sum3WLow<1>(a); + sum.val[0] = Sum3WLo<0>(a); + sum.val[1] = Sum3WHi(a); + sum.val[2] = Sum3WLo<1>(a); return sum; } @@ -844,35 +463,35 @@ inline uint16x4_t Sum5(const uint16x4_t a[5]) { return vadd_u16(sum, a[4]); } -inline uint16x8_t Sum5(const uint16x8_t a[5]) { +inline uint16x8_t Sum5_16(const uint16x8_t a[5]) { const uint16x8_t sum01 = vaddq_u16(a[0], a[1]); const uint16x8_t sum23 = vaddq_u16(a[2], a[3]); const uint16x8_t sum = vaddq_u16(sum01, sum23); return vaddq_u16(sum, a[4]); } -inline uint32x4_t Sum5(const uint32x4_t a[5]) { +inline uint32x4_t Sum5_32(const uint32x4_t a[5]) { const uint32x4_t sum01 = vaddq_u32(a[0], a[1]); const uint32x4_t sum23 = vaddq_u32(a[2], a[3]); const uint32x4_t sum = vaddq_u32(sum01, sum23); return vaddq_u32(sum, a[4]); } -inline uint16x8_t Sum5W(const uint8x8_t a[5]) { +inline uint16x8_t Sum5W_16(const uint8x8_t a[5]) { const uint16x8_t sum01 = vaddl_u8(a[0], a[1]); const uint16x8_t sum23 = vaddl_u8(a[2], a[3]); const uint16x8_t sum = vaddq_u16(sum01, sum23); return vaddw_u8(sum, a[4]); } -inline uint32x4_t Sum5W(const uint16x4_t a[5]) { +inline uint32x4_t Sum5W_32(const uint16x4_t a[5]) { const uint32x4_t sum01 = vaddl_u16(a[0], a[1]); const uint32x4_t sum23 = vaddl_u16(a[2], a[3]); const uint32x4_t sum0123 = vaddq_u32(sum01, sum23); return vaddw_u16(sum0123, a[4]); } -inline uint16x8x2_t Sum5W(const uint8x16_t a[5]) { +inline uint16x8x2_t Sum5W_16D(const uint8x16_t a[5]) { uint16x8x2_t sum; uint8x8_t low[5], high[5]; low[0] = vget_low_u8(a[0]); @@ -885,12 +504,12 @@ inline uint16x8x2_t Sum5W(const uint8x16_t a[5]) { high[2] = vget_high_u8(a[2]); high[3] = vget_high_u8(a[3]); high[4] = vget_high_u8(a[4]); - sum.val[0] = Sum5W(low); - sum.val[1] = Sum5W(high); + sum.val[0] = Sum5W_16(low); + sum.val[1] = Sum5W_16(high); return sum; } -inline uint32x4x2_t Sum5W(const uint16x8_t a[5]) { +inline uint32x4x2_t Sum5W_32x2(const uint16x8_t a[5]) { uint32x4x2_t sum; uint16x4_t low[5], high[5]; low[0] = vget_low_u16(a[0]); @@ -903,113 +522,112 @@ inline uint32x4x2_t Sum5W(const uint16x8_t a[5]) { high[2] = vget_high_u16(a[2]); high[3] = vget_high_u16(a[3]); high[4] = vget_high_u16(a[4]); - sum.val[0] = Sum5W(low); - sum.val[1] = Sum5W(high); + sum.val[0] = Sum5W_32(low); + sum.val[1] = 
Sum5W_32(high); return sum; } template <int index> -inline uint32x4_t Sum5WLow(const uint16x8x2_t a[5]) { +inline uint32x4_t Sum5WLo(const uint16x8x2_t a[5]) { uint16x4_t low[5]; low[0] = vget_low_u16(a[0].val[index]); low[1] = vget_low_u16(a[1].val[index]); low[2] = vget_low_u16(a[2].val[index]); low[3] = vget_low_u16(a[3].val[index]); low[4] = vget_low_u16(a[4].val[index]); - return Sum5W(low); + return Sum5W_32(low); } -template <int index> -inline uint32x4_t Sum5WHigh(const uint16x8x2_t a[5]) { +inline uint32x4_t Sum5WHi(const uint16x8x2_t a[5]) { uint16x4_t high[5]; - high[0] = vget_high_u16(a[0].val[index]); - high[1] = vget_high_u16(a[1].val[index]); - high[2] = vget_high_u16(a[2].val[index]); - high[3] = vget_high_u16(a[3].val[index]); - high[4] = vget_high_u16(a[4].val[index]); - return Sum5W(high); + high[0] = vget_high_u16(a[0].val[0]); + high[1] = vget_high_u16(a[1].val[0]); + high[2] = vget_high_u16(a[2].val[0]); + high[3] = vget_high_u16(a[3].val[0]); + high[4] = vget_high_u16(a[4].val[0]); + return Sum5W_32(high); } -inline uint32x4x3_t Sum5W(const uint16x8x2_t a[5]) { +inline uint32x4x3_t Sum5W_32x3(const uint16x8x2_t a[5]) { uint32x4x3_t sum; - sum.val[0] = Sum5WLow<0>(a); - sum.val[1] = Sum5WHigh<0>(a); - sum.val[2] = Sum5WLow<1>(a); + sum.val[0] = Sum5WLo<0>(a); + sum.val[1] = Sum5WHi(a); + sum.val[2] = Sum5WLo<1>(a); return sum; } inline uint16x4_t Sum3Horizontal(const uint16x8_t a) { const uint16x4_t left = vget_low_u16(a); - const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); - const uint16x4_t right = vext_u16(vget_low_u16(a), vget_high_u16(a), 2); + const uint16x4_t middle = VshrU128<2>(a); + const uint16x4_t right = VshrU128<4>(a); return Sum3(left, middle, right); } -inline uint16x8_t Sum3Horizontal(const uint16x8x2_t a) { +inline uint16x8_t Sum3Horizontal_16(const uint16x8x2_t a) { const uint16x8_t left = a.val[0]; const uint16x8_t middle = vextq_u16(a.val[0], a.val[1], 1); const uint16x8_t right = vextq_u16(a.val[0], a.val[1], 2); - return Sum3(left, middle, right); + return Sum3_16(left, middle, right); } -inline uint32x4_t Sum3Horizontal(const uint32x4x2_t a) { +inline uint32x4_t Sum3Horizontal_32(const uint32x4x2_t a) { const uint32x4_t left = a.val[0]; const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 2); - return Sum3(left, middle, right); + return Sum3_32(left, middle, right); } -inline uint32x4x2_t Sum3Horizontal(const uint32x4x3_t a) { +inline uint32x4x2_t Sum3Horizontal_32x2(const uint32x4x3_t a) { uint32x4x2_t sum; { const uint32x4_t left = a.val[0]; const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 2); - sum.val[0] = Sum3(left, middle, right); + sum.val[0] = Sum3_32(left, middle, right); } { const uint32x4_t left = a.val[1]; const uint32x4_t middle = vextq_u32(a.val[1], a.val[2], 1); const uint32x4_t right = vextq_u32(a.val[1], a.val[2], 2); - sum.val[1] = Sum3(left, middle, right); + sum.val[1] = Sum3_32(left, middle, right); } return sum; } inline uint16x4_t Sum3HorizontalOffset1(const uint16x8_t a) { - const uint16x4_t left = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); - const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 2); - const uint16x4_t right = vext_u16(vget_low_u16(a), vget_high_u16(a), 3); + const uint16x4_t left = VshrU128<2>(a); + const uint16x4_t middle = VshrU128<4>(a); + const uint16x4_t right = VshrU128<6>(a); return Sum3(left, middle, right); } -inline 
uint16x8_t Sum3HorizontalOffset1(const uint16x8x2_t a) { +inline uint16x8_t Sum3HorizontalOffset1_16(const uint16x8x2_t a) { const uint16x8_t left = vextq_u16(a.val[0], a.val[1], 1); const uint16x8_t middle = vextq_u16(a.val[0], a.val[1], 2); const uint16x8_t right = vextq_u16(a.val[0], a.val[1], 3); - return Sum3(left, middle, right); + return Sum3_16(left, middle, right); } -inline uint32x4_t Sum3HorizontalOffset1(const uint32x4x2_t a) { +inline uint32x4_t Sum3HorizontalOffset1_32(const uint32x4x2_t a) { const uint32x4_t left = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 2); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 3); - return Sum3(left, middle, right); + return Sum3_32(left, middle, right); } -inline uint32x4x2_t Sum3HorizontalOffset1(const uint32x4x3_t a) { +inline uint32x4x2_t Sum3HorizontalOffset1_32x2(const uint32x4x3_t a) { uint32x4x2_t sum; { const uint32x4_t left = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 2); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 3); - sum.val[0] = Sum3(left, middle, right); + sum.val[0] = Sum3_32(left, middle, right); } { const uint32x4_t left = vextq_u32(a.val[1], a.val[2], 1); const uint32x4_t middle = vextq_u32(a.val[1], a.val[2], 2); const uint32x4_t right = vextq_u32(a.val[1], a.val[2], 3); - sum.val[1] = Sum3(left, middle, right); + sum.val[1] = Sum3_32(left, middle, right); } return sum; } @@ -1017,34 +635,34 @@ inline uint32x4x2_t Sum3HorizontalOffset1(const uint32x4x3_t a) { inline uint16x4_t Sum5Horizontal(const uint16x8_t a) { uint16x4_t s[5]; s[0] = vget_low_u16(a); - s[1] = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); - s[2] = vext_u16(vget_low_u16(a), vget_high_u16(a), 2); - s[3] = vext_u16(vget_low_u16(a), vget_high_u16(a), 3); + s[1] = VshrU128<2>(a); + s[2] = VshrU128<4>(a); + s[3] = VshrU128<6>(a); s[4] = vget_high_u16(a); return Sum5(s); } -inline uint16x8_t Sum5Horizontal(const uint16x8x2_t a) { +inline uint16x8_t Sum5Horizontal_16(const uint16x8x2_t a) { uint16x8_t s[5]; s[0] = a.val[0]; s[1] = vextq_u16(a.val[0], a.val[1], 1); s[2] = vextq_u16(a.val[0], a.val[1], 2); s[3] = vextq_u16(a.val[0], a.val[1], 3); - s[4] = vcombine_u16(vget_high_u16(a.val[0]), vget_low_u16(a.val[1])); - return Sum5(s); + s[4] = vextq_u16(a.val[0], a.val[1], 4); + return Sum5_16(s); } -inline uint32x4_t Sum5Horizontal(const uint32x4x2_t a) { +inline uint32x4_t Sum5Horizontal_32(const uint32x4x2_t a) { uint32x4_t s[5]; s[0] = a.val[0]; s[1] = vextq_u32(a.val[0], a.val[1], 1); s[2] = vextq_u32(a.val[0], a.val[1], 2); s[3] = vextq_u32(a.val[0], a.val[1], 3); s[4] = a.val[1]; - return Sum5(s); + return Sum5_32(s); } -inline uint32x4x2_t Sum5Horizontal(const uint32x4x3_t a) { +inline uint32x4x2_t Sum5Horizontal_32x2(const uint32x4x3_t a) { uint32x4x2_t sum; uint32x4_t s[5]; s[0] = a.val[0]; @@ -1052,43 +670,42 @@ inline uint32x4x2_t Sum5Horizontal(const uint32x4x3_t a) { s[2] = vextq_u32(a.val[0], a.val[1], 2); s[3] = vextq_u32(a.val[0], a.val[1], 3); s[4] = a.val[1]; - sum.val[0] = Sum5(s); + sum.val[0] = Sum5_32(s); s[0] = a.val[1]; s[1] = vextq_u32(a.val[1], a.val[2], 1); s[2] = vextq_u32(a.val[1], a.val[2], 2); s[3] = vextq_u32(a.val[1], a.val[2], 3); s[4] = a.val[2]; - sum.val[1] = Sum5(s); + sum.val[1] = Sum5_32(s); return sum; } template <int size, int offset> -inline void PreProcess4(const uint8x8_t* const row, - const uint16x8_t* const row_sq, const uint32_t s, - uint16_t* const dst) { +inline void BoxFilterPreProcess4(const 
uint8x8_t* const row, + const uint16x8_t* const row_sq, + const uint32_t s, uint16_t* const dst) { static_assert(offset == 0 || offset == 1, ""); // Number of elements in the box being summed. constexpr uint32_t n = size * size; constexpr uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const uint16x4_t v_255 = vdup_n_u16(255); uint16x4_t sum; uint32x4_t sum_sq; if (size == 3) { if (offset == 0) { - sum = Sum3Horizontal(Sum3W(row)); - sum_sq = Sum3Horizontal(Sum3W(row_sq)); + sum = Sum3Horizontal(Sum3W_16(row)); + sum_sq = Sum3Horizontal_32(Sum3W(row_sq)); } else { - sum = Sum3HorizontalOffset1(Sum3W(row)); - sum_sq = Sum3HorizontalOffset1(Sum3W(row_sq)); + sum = Sum3HorizontalOffset1(Sum3W_16(row)); + sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq)); } } if (size == 5) { - sum = Sum5Horizontal(Sum5W(row)); - sum_sq = Sum5Horizontal(Sum5W(row_sq)); + sum = Sum5Horizontal(Sum5W_16(row)); + sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq)); } const uint16x4_t z0 = CalculateSgrMA2<n>(sum_sq, sum, s); - const uint16x4_t z = vmin_u16(v_255, z0); + const uint16x4_t z = vmin_u16(z0, vdup_n_u16(255)); // Using vget_lane_s16() can save a sign extension instruction. // Add 4 0s for memory initialization purpose only. const uint8_t lookup[8] = { @@ -1101,42 +718,41 @@ inline void PreProcess4(const uint8x8_t* const row, kSgrMa2Lookup[vget_lane_s16(vreinterpret_s16_u16(z), 2)], kSgrMa2Lookup[vget_lane_s16(vreinterpret_s16_u16(z), 3)]}; const uint8x8_t sgr_ma2 = vld1_u8(lookup); - const uint16x4_t b2 = CalculateB2Shifted(sgr_ma2, sum, one_over_n); + const uint16x4_t b2 = CalculateIntermediate4(sgr_ma2, sum, one_over_n); const uint16x8_t sgr_ma2_b2 = vcombine_u16(vreinterpret_u16_u8(sgr_ma2), b2); vst1q_u16(dst, sgr_ma2_b2); } template <int size, int offset> -inline void PreProcess8(const uint8x16_t* const row, - const uint16x8x2_t* const row_sq, const uint32_t s, - uint8x8_t* const sgr_ma2, uint16x8_t* const b2, - uint16_t* const dst) { +inline void BoxFilterPreProcess8(const uint8x16_t* const row, + const uint16x8x2_t* const row_sq, + const uint32_t s, uint8x8_t* const sgr_ma2, + uint16x8_t* const b2, uint16_t* const dst) { // Number of elements in the box being summed. constexpr uint32_t n = size * size; constexpr uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const uint16x8_t v_255 = vdupq_n_u16(255); uint16x8_t sum; uint32x4x2_t sum_sq; if (size == 3) { if (offset == 0) { - sum = Sum3Horizontal(Sum3W(row)); - sum_sq = Sum3Horizontal(Sum3W(row_sq)); + sum = Sum3Horizontal_16(Sum3W_16x2(row)); + sum_sq = Sum3Horizontal_32x2(Sum3W(row_sq)); } else /* if (offset == 1) */ { - sum = Sum3HorizontalOffset1(Sum3W(row)); - sum_sq = Sum3HorizontalOffset1(Sum3W(row_sq)); + sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row)); + sum_sq = Sum3HorizontalOffset1_32x2(Sum3W(row_sq)); } } if (size == 5) { - sum = Sum5Horizontal(Sum5W(row)); - sum_sq = Sum5Horizontal(Sum5W(row_sq)); + sum = Sum5Horizontal_16(Sum5W_16D(row)); + sum_sq = Sum5Horizontal_32x2(Sum5W_32x3(row_sq)); } const uint16x4_t z0 = CalculateSgrMA2<n>(sum_sq.val[0], vget_low_u16(sum), s); const uint16x4_t z1 = CalculateSgrMA2<n>(sum_sq.val[1], vget_high_u16(sum), s); const uint16x8_t z01 = vcombine_u16(z0, z1); // Using vqmovn_u16() needs an extra sign extension instruction. - const uint16x8_t z = vminq_u16(v_255, z01); + const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255)); // Using vgetq_lane_s16() can save the sign extension instruction. 
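// [Editorial sketch, not part of the upstream diff] A scalar reference for the
// per-lane math that BoxFilterPreProcess4/BoxFilterPreProcess8 vectorize,
// assembled from the formulas quoted in the surrounding comments. The helper
// name is hypothetical; the constants are the ones from the removed block
// above (kSgrProjScaleBits = 20, kSgrProjSgrBits = 8,
// kSgrProjReciprocalBits = 12), and |p| is the variance-like term computed
// from the box sums before CalculateSgrMA2.
#include <algorithm>
#include <cstdint>
inline uint16_t SgrIntermediateScalar(uint32_t p, uint32_t s, uint32_t sum,
                                      uint32_t n) {
  // z = RightShiftWithRounding(p * s, kSgrProjScaleBits), capped at 255.
  const uint32_t z = std::min<uint32_t>((p * s + (1u << 19)) >> 20, 255);
  // kSgrMa2Lookup[z] holds 256 - a2 (clamped to 255 for z == 0), where
  // a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1).
  const uint32_t a2 = ((z << 8) + (z >> 1)) / (z + 1);
  const uint32_t sgr_ma2 = std::min<uint32_t>(256 - a2, 255);
  // b2 = (256 - a2) * sum * one_over_n, followed by a rounding shift by
  // kSgrProjReciprocalBits, as in CalculateIntermediate4/8.
  const uint32_t one_over_n = ((1u << 12) + (n >> 1)) / n;
  const uint32_t b2 = sgr_ma2 * sum * one_over_n;
  return static_cast<uint16_t>((b2 + (1u << 11)) >> 12);
}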
const uint8_t lookup[8] = { kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)], @@ -1148,40 +764,40 @@ inline void PreProcess8(const uint8x16_t* const row, kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)], kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]}; *sgr_ma2 = vld1_u8(lookup); - *b2 = CalculateB2Shifted(*sgr_ma2, sum, one_over_n); + *b2 = CalculateIntermediate8(*sgr_ma2, sum, one_over_n); const uint16x8_t sgr_ma2_b2 = vcombine_u16(vreinterpret_u16_u8(*sgr_ma2), vget_high_u16(*b2)); vst1q_u16(dst, sgr_ma2_b2); } -inline void Prepare3(const uint8x8_t a[2], uint8x8_t* const left, - uint8x8_t* const middle, uint8x8_t* const right) { +inline void Prepare3_8(const uint8x8_t a[2], uint8x8_t* const left, + uint8x8_t* const middle, uint8x8_t* const right) { *left = vext_u8(a[0], a[1], 4); *middle = vext_u8(a[0], a[1], 5); *right = vext_u8(a[0], a[1], 6); } -inline void Prepare3(const uint16x8_t a[2], uint16x8_t* const left, - uint16x8_t* const middle, uint16x8_t* const right) { - *left = vcombine_u16(vget_high_u16(a[0]), vget_low_u16(a[1])); +inline void Prepare3_16(const uint16x8_t a[2], uint16x8_t* const left, + uint16x8_t* const middle, uint16x8_t* const right) { + *left = vextq_u16(a[0], a[1], 4); *middle = vextq_u16(a[0], a[1], 5); *right = vextq_u16(a[0], a[1], 6); } inline uint16x8_t Sum343(const uint8x8_t a[2]) { uint8x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); - const uint16x8_t sum = Sum3W(left, middle, right); - const uint16x8_t sum3 = Sum3(sum, sum, sum); + Prepare3_8(a, &left, &middle, &right); + const uint16x8_t sum = Sum3W_16(left, middle, right); + const uint16x8_t sum3 = Sum3_16(sum, sum, sum); return vaddw_u8(sum3, middle); } inline void Sum343_444(const uint8x8_t a[2], uint16x8_t* const sum343, uint16x8_t* const sum444) { uint8x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); - const uint16x8_t sum = Sum3W(left, middle, right); - const uint16x8_t sum3 = Sum3(sum, sum, sum); + Prepare3_8(a, &left, &middle, &right); + const uint16x8_t sum = Sum3W_16(left, middle, right); + const uint16x8_t sum3 = Sum3_16(sum, sum, sum); *sum343 = vaddw_u8(sum3, middle); *sum444 = vshlq_n_u16(sum, 2); } @@ -1189,13 +805,13 @@ inline void Sum343_444(const uint8x8_t a[2], uint16x8_t* const sum343, inline uint32x4x2_t Sum343W(const uint16x8_t a[2]) { uint16x8_t left, middle, right; uint32x4x2_t d; - Prepare3(a, &left, &middle, &right); + Prepare3_16(a, &left, &middle, &right); d.val[0] = - Sum3W(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); - d.val[1] = - Sum3W(vget_high_u16(left), vget_high_u16(middle), vget_high_u16(right)); - d.val[0] = Sum3(d.val[0], d.val[0], d.val[0]); - d.val[1] = Sum3(d.val[1], d.val[1], d.val[1]); + Sum3W_32(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); + d.val[1] = Sum3W_32(vget_high_u16(left), vget_high_u16(middle), + vget_high_u16(right)); + d.val[0] = Sum3_32(d.val[0], d.val[0], d.val[0]); + d.val[1] = Sum3_32(d.val[1], d.val[1], d.val[1]); d.val[0] = vaddw_u16(d.val[0], vget_low_u16(middle)); d.val[1] = vaddw_u16(d.val[1], vget_high_u16(middle)); return d; @@ -1204,13 +820,13 @@ inline uint32x4x2_t Sum343W(const uint16x8_t a[2]) { inline void Sum343_444W(const uint16x8_t a[2], uint32x4x2_t* const sum343, uint32x4x2_t* const sum444) { uint16x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); + Prepare3_16(a, &left, &middle, &right); sum444->val[0] = - Sum3W(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); - sum444->val[1] = - 
Sum3W(vget_high_u16(left), vget_high_u16(middle), vget_high_u16(right)); - sum343->val[0] = Sum3(sum444->val[0], sum444->val[0], sum444->val[0]); - sum343->val[1] = Sum3(sum444->val[1], sum444->val[1], sum444->val[1]); + Sum3W_32(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); + sum444->val[1] = Sum3W_32(vget_high_u16(left), vget_high_u16(middle), + vget_high_u16(right)); + sum343->val[0] = Sum3_32(sum444->val[0], sum444->val[0], sum444->val[0]); + sum343->val[1] = Sum3_32(sum444->val[1], sum444->val[1], sum444->val[1]); sum343->val[0] = vaddw_u16(sum343->val[0], vget_low_u16(middle)); sum343->val[1] = vaddw_u16(sum343->val[1], vget_high_u16(middle)); sum444->val[0] = vshlq_n_u32(sum444->val[0], 2); @@ -1219,8 +835,8 @@ inline void Sum343_444W(const uint16x8_t a[2], uint32x4x2_t* const sum343, inline uint16x8_t Sum565(const uint8x8_t a[2]) { uint8x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); - const uint16x8_t sum = Sum3W(left, middle, right); + Prepare3_8(a, &left, &middle, &right); + const uint16x8_t sum = Sum3W_16(left, middle, right); const uint16x8_t sum4 = vshlq_n_u16(sum, 2); const uint16x8_t sum5 = vaddq_u16(sum4, sum); return vaddw_u8(sum5, middle); @@ -1228,9 +844,9 @@ inline uint16x8_t Sum565(const uint8x8_t a[2]) { inline uint32x4_t Sum565W(const uint16x8_t a) { const uint16x4_t left = vget_low_u16(a); - const uint16x4_t middle = vext_u16(left, vget_high_u16(a), 1); - const uint16x4_t right = vext_u16(left, vget_high_u16(a), 2); - const uint32x4_t sum = Sum3W(left, middle, right); + const uint16x4_t middle = VshrU128<2>(a); + const uint16x4_t right = VshrU128<4>(a); + const uint32x4_t sum = Sum3W_32(left, middle, right); const uint32x4_t sum4 = vshlq_n_u32(sum, 2); const uint32x4_t sum5 = vaddq_u32(sum4, sum); return vaddw_u16(sum5, middle); @@ -1256,53 +872,95 @@ inline uint16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t a, } template <int shift> -inline void CalculateFilteredOutput(const uint8x8_t src, const uint16x8_t a, - const uint32x4x2_t b, uint16_t* const dst) { +inline int16x8_t CalculateFilteredOutput(const uint8x8_t src, + const uint16x8_t a, + const uint32x4x2_t b) { const uint16x8_t src_u16 = vmovl_u8(src); const uint16x4_t dst_lo = FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(a), b.val[0]); const uint16x4_t dst_hi = FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(a), b.val[1]); - const uint16x8_t d = vcombine_u16(dst_lo, dst_hi); - vst1q_u16(dst, d); + return vreinterpretq_s16_u16(vcombine_u16(dst_lo, dst_hi)); // 14 bits } -inline void BoxFilter1(const uint8x8_t src_u8, const uint8x8_t a2[2], - const uint16x8_t b2[2], uint16x8_t sum565_a[2], - uint32x4x2_t sum565_b[2], uint16_t* const out_buf) { +inline int16x8_t BoxFilterPass1(const uint8x8_t src_u8, const uint8x8_t a2[2], + const uint16x8_t b2[2], uint16x8_t sum565_a[2], + uint32x4x2_t sum565_b[2]) { uint32x4x2_t b_v; sum565_a[1] = Sum565(a2); sum565_a[1] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[1]); - sum565_b[1].val[0] = - Sum565W(vcombine_u16(vget_high_u16(b2[0]), vget_low_u16(b2[1]))); + sum565_b[1].val[0] = Sum565W(vextq_u16(b2[0], b2[1], 4)); sum565_b[1].val[1] = Sum565W(b2[1]); uint16x8_t a_v = vaddq_u16(sum565_a[0], sum565_a[1]); b_v.val[0] = vaddq_u32(sum565_b[0].val[0], sum565_b[1].val[0]); b_v.val[1] = vaddq_u32(sum565_b[0].val[1], sum565_b[1].val[1]); - CalculateFilteredOutput<5>(src_u8, a_v, b_v, out_buf); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits } -inline void BoxFilter2(const uint8x8_t src_u8, 
const uint8x8_t a2[2], - const uint16x8_t b2[2], uint16x8_t sum343_a[4], - uint16x8_t sum444_a[3], uint32x4x2_t sum343_b[4], - uint32x4x2_t sum444_b[3], uint16_t* const out_buf) { +inline int16x8_t BoxFilterPass2(const uint8x8_t src_u8, const uint8x8_t a2[2], + const uint16x8_t b2[2], uint16x8_t sum343_a[4], + uint16x8_t sum444_a[3], + uint32x4x2_t sum343_b[4], + uint32x4x2_t sum444_b[3]) { uint32x4x2_t b_v; Sum343_444(a2, &sum343_a[2], &sum444_a[1]); sum343_a[2] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[2]); sum444_a[1] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[1]); - uint16x8_t a_v = Sum3(sum343_a[0], sum444_a[0], sum343_a[2]); + uint16x8_t a_v = Sum3_16(sum343_a[0], sum444_a[0], sum343_a[2]); Sum343_444W(b2, &sum343_b[2], &sum444_b[1]); - b_v.val[0] = Sum3(sum343_b[0].val[0], sum444_b[0].val[0], sum343_b[2].val[0]); - b_v.val[1] = Sum3(sum343_b[0].val[1], sum444_b[0].val[1], sum343_b[2].val[1]); - CalculateFilteredOutput<5>(src_u8, a_v, b_v, out_buf); + b_v.val[0] = + Sum3_32(sum343_b[0].val[0], sum444_b[0].val[0], sum343_b[2].val[0]); + b_v.val[1] = + Sum3_32(sum343_b[0].val[1], sum444_b[0].val[1], sum343_b[2].val[1]); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits +} + +inline void SelfGuidedDoubleMultiplier( + const uint8x8_t src, const int16x8_t box_filter_process_output[2], + const int16x4_t w0, const int16x4_t w1, const int16x4_t w2, + uint8_t* const dst) { + // |wN| values are signed. |src| values can be treated as int16_t. + const int16x8_t u = + vreinterpretq_s16_u16(vshll_n_u8(src, kSgrProjRestoreBits)); + int32x4_t v_lo = vmull_s16(vget_low_s16(u), w1); + v_lo = vmlal_s16(v_lo, vget_low_s16(box_filter_process_output[0]), w0); + v_lo = vmlal_s16(v_lo, vget_low_s16(box_filter_process_output[1]), w2); + int32x4_t v_hi = vmull_s16(vget_high_s16(u), w1); + v_hi = vmlal_s16(v_hi, vget_high_s16(box_filter_process_output[0]), w0); + v_hi = vmlal_s16(v_hi, vget_high_s16(box_filter_process_output[1]), w2); + // |s| is saturated to uint8_t. 
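// [Editorial sketch, not part of the upstream diff] The scalar equivalent of
// one pixel of SelfGuidedDoubleMultiplier. The function name is hypothetical;
// kSgrProjRestoreBits is 4 (from the removed constants earlier in this file),
// and kSgrProjPrecisionBits is assumed to be 7 here since its definition is
// outside this diff.
#include <algorithm>
#include <cstdint>
inline uint8_t SelfGuidedPixelSketch(uint8_t src, int16_t p0, int16_t p1,
                                     int16_t w0, int16_t w1, int16_t w2) {
  const int32_t u = src << 4;  // src << kSgrProjRestoreBits
  // |p0| and |p1| are the 14-bit outputs of box filter pass 1 and pass 2.
  const int32_t v = u * w1 + p0 * w0 + p1 * w2;
  // Rounding shift by kSgrProjRestoreBits + kSgrProjPrecisionBits (4 + 7).
  const int32_t s = (v + (1 << 10)) >> 11;
  return static_cast<uint8_t>(std::min(std::max(s, 0), 255));  // saturate
}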
+ const int16x4_t s_lo = + vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const int16x4_t s_hi = + vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + vst1_u8(dst, vqmovun_s16(vcombine_s16(s_lo, s_hi))); } -inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, +inline void SelfGuidedSingleMultiplier( + const uint8x8_t src, const int16x8_t box_filter_process_output, + const int16_t w0, const int16_t w1, uint8_t* dst) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + const int16x8_t u = + vreinterpretq_s16_u16(vshll_n_u8(src, kSgrProjRestoreBits)); + // u * w1 + u * wN == u * (w1 + wN) + int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w1); + v_lo = vmlal_n_s16(v_lo, vget_low_s16(box_filter_process_output), w0); + int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w1); + v_hi = vmlal_n_s16(v_hi, vget_high_s16(box_filter_process_output), w0); + const int16x4_t s_lo = + vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const int16x4_t s_hi = + vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + vst1_u8(dst, vqmovun_s16(vcombine_s16(s_lo, s_hi))); +} + +inline void BoxFilterProcess(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, const int width, const int height, - const uint16_t s[2], - uint16_t* const box_filter_process_output, - uint16_t* const temp) { + const uint16_t s[2], uint16_t* const temp, + uint8_t* const dst, const ptrdiff_t dst_stride) { // We have combined PreProcess and Process for the first pass by storing // intermediate values in the |a2| region. The values stored are one vertical // column of interleaved |a2| and |b2| values and consume 8 * |height| values. @@ -1340,45 +998,39 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, // interleaved in |temp|. The first half is not stored, since it is used // immediately and becomes useless for the next column. Next we will start the // second column. When 2 rows have been calculated we can calculate Process - // and output those into the top of |box_filter_process_output|. + // and output the results. // Calculate and store a single column. Scope so we can re-use the variable // names for the next step. uint16_t* ab_ptr = temp; - // The first phase needs a radius of 2 context values. The second phase - // needs a context of radius 1 values. This means we start at (-3, -3). - const uint8_t* const src_pre_process = src - 3 - 3 * stride; - // Calculate intermediate results, including two-pixel border, for example, - // if unit size is 64x64, we calculate 68x68 pixels. + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. 
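// [Editorial note, not part of the upstream diff] The 68x68 figure above is
// the unit plus the pass-1 context: a radius-2 box needs two extra pixels on
// each side, so 64 + 2 + 2 = 68. The second pass uses a radius-1 box, which
// is why the pass-2 code later in this file quotes 66x66 (64 + 1 + 1).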
{ const uint8_t* column = src_pre_process; uint8x8_t row[5]; uint16x8_t row_sq[5]; - - row[0] = vld1_u8(column); - column += stride; - row[1] = vld1_u8(column); - column += stride; + row[0] = row[1] = vld1_u8(column); + column += src_stride; row[2] = vld1_u8(column); - row_sq[0] = vmull_u8(row[0], row[0]); - row_sq[1] = vmull_u8(row[1], row[1]); + row_sq[0] = row_sq[1] = vmull_u8(row[1], row[1]); row_sq[2] = vmull_u8(row[2], row[2]); - int y = 0; + int y = (height + 2) >> 1; do { - column += stride; + column += src_stride; row[3] = vld1_u8(column); - column += stride; + column += src_stride; row[4] = vld1_u8(column); row_sq[3] = vmull_u8(row[3], row[3]); row_sq[4] = vmull_u8(row[4], row[4]); - PreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); - PreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); - PreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16); + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16); row[0] = row[2]; row[1] = row[3]; @@ -1388,10 +1040,23 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; ab_ptr += 24; - y += 2; - } while (y < height + 2); + } while (--y != 0); + + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = vld1_u8(column); + row_sq[3] = row_sq[4] = vmull_u8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + } } + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + const int16x4_t w0_v = vdup_n_s16(w0); + const int16x4_t w1_v = vdup_n_s16(w1); + const int16x4_t w2_v = vdup_n_s16(w2); int x = 0; do { // |src_pre_process| is X but we already processed the first column of 4 @@ -1423,21 +1088,18 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, const uint8_t* column = src_pre_process + x + 4; uint8x16_t row[5]; uint16x8x2_t row_sq[5]; - - row[0] = vld1q_u8(column); - column += stride; - row[1] = vld1q_u8(column); - column += stride; + row[0] = row[1] = vld1q_u8(column); + column += src_stride; row[2] = vld1q_u8(column); - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); - row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0])); - row_sq[0].val[1] = vmull_u8(vget_high_u8(row[0]), vget_high_u8(row[0])); - row_sq[1].val[0] = vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); - row_sq[1].val[1] = vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); + row_sq[0].val[0] = row_sq[1].val[0] = + vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); + row_sq[0].val[1] = row_sq[1].val[1] = + vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1445,21 +1107,17 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s[0], 
&a2[0][1], &b2[0][1], ab_ptr); - PreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 8); + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1], + ab_ptr + 8); // Pass 1 Process. These are the only values we need to propagate between // rows. sum565_a[0] = Sum565(a2[0]); sum565_a[0] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[0]); - sum565_b[0].val[0] = - Sum565W(vcombine_u16(vget_high_u16(b2[0][0]), vget_low_u16(b2[0][1]))); + sum565_b[0].val[0] = Sum565W(vextq_u16(b2[0][0], b2[0][1], 4)); sum565_b[0].val[1] = Sum565W(b2[0][1]); - const uint8_t* src_ptr = src + x; - uint16_t* out_buf = box_filter_process_output + 2 * x; - sum343_a[0] = Sum343(a2[1]); sum343_a[0] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[0]); sum343_b[0] = Sum343W(b2[1]); @@ -1467,19 +1125,21 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, b2[1][0] = vld1q_u16(ab_ptr + 16); a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0])); - PreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 16); + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1], + ab_ptr + 16); Sum343_444(a2[1], &sum343_a[1], &sum444_a[0]); sum343_a[1] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[1]); sum444_a[0] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[0]); Sum343_444W(b2[1], &sum343_b[1], &sum444_b[0]); + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + // Calculate one output line. Add in the line from the previous pass and // output one even row. Sum the new line and output the odd row. Carry the // new row into the next pass. - int y = 0; - do { + for (int y = height >> 1; y != 0; --y) { ab_ptr += 24; b2[0][0] = vld1q_u16(ab_ptr); a2[0][0] = vget_low_u8(vreinterpretq_u8_u16(b2[0][0])); @@ -1494,9 +1154,9 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1504,28 +1164,31 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], ab_ptr); - PreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 8); + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], + ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], + &b2[1][1], ab_ptr + 8); + + int16x8_t p[2]; + const uint8x8_t src0 = vld1_u8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; - uint8x8_t src_u8 = vld1_u8(src_ptr); - BoxFilter1(src_u8, a2[0], b2[0], sum565_a, sum565_b, out_buf); - BoxFilter2(src_u8, a2[1], b2[1], sum343_a, sum444_a, sum343_b, sum444_b, - out_buf + 8); - src_ptr += stride; - out_buf += 2 * kRestorationProcessingUnitSize; - - src_u8 = vld1_u8(src_ptr); - CalculateFilteredOutput<4>(src_u8, sum565_a[1], sum565_b[1], out_buf); + 
const uint8x8_t src1 = vld1_u8(src_ptr); + p[0] = CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); b2[1][0] = vld1q_u16(ab_ptr + 16); a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0])); - PreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 16); - - BoxFilter2(src_u8, a2[1], b2[1], sum343_a + 1, sum444_a + 1, sum343_b + 1, - sum444_b + 1, out_buf + 8); - src_ptr += stride; - out_buf += 2 * kRestorationProcessingUnitSize; + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], + &b2[1][1], ab_ptr + 16); + p[1] = BoxFilterPass2(src1, a2[1], b2[1], sum343_a + 1, sum444_a + 1, + sum343_b + 1, sum444_b + 1); + SelfGuidedDoubleMultiplier(src1, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; sum565_a[0] = sum565_a[1]; sum565_b[0] = sum565_b[1]; @@ -1535,17 +1198,53 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, sum343_b[0] = sum343_b[2]; sum343_b[1] = sum343_b[3]; sum444_b[0] = sum444_b[2]; + } + if ((height & 1) != 0) { + ab_ptr += 24; + b2[0][0] = vld1q_u16(ab_ptr); + a2[0][0] = vget_low_u8(vreinterpretq_u8_u16(b2[0][0])); + b2[1][0] = vld1q_u16(ab_ptr + 8); + a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0])); - y += 2; - } while (y < height); + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + + column += src_stride; + row[3] = row[4] = vld1q_u8(column); + + row_sq[3].val[0] = row_sq[4].val[0] = + vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); + row_sq[3].val[1] = row_sq[4].val[1] = + vmull_u8(vget_high_u8(row[3]), vget_high_u8(row[3])); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], + ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], + &b2[1][1], ab_ptr + 8); + + int16x8_t p[2]; + const uint8x8_t src0 = vld1_u8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + } x += 8; } while (x < width); } -inline void BoxFilterProcess_FirstPass( - const uint8_t* const src, const ptrdiff_t stride, const int width, - const int height, const uint32_t s, - uint16_t* const box_filter_process_output, uint16_t* const temp) { +inline void BoxFilterProcessPass1(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { // We have combined PreProcess and Process for the first pass by storing // intermediate values in the |a2| region. The values stored are one vertical // column of interleaved |a2| and |b2| values and consume 8 * |height| values. @@ -1583,43 +1282,37 @@ inline void BoxFilterProcess_FirstPass( // interleaved in |temp|. The first half is not stored, since it is used // immediately and becomes useless for the next column. Next we will start the // second column. When 2 rows have been calculated we can calculate Process - // and output those into the top of |box_filter_process_output|. + // and output the results. // Calculate and store a single column. Scope so we can re-use the variable // names for the next step. uint16_t* ab_ptr = temp; - // The first phase needs a radius of 2 context values. The second phase - // needs a context of radius 1 values. This means we start at (-3, -3). 
- const uint8_t* const src_pre_process = src - 3 - 3 * stride; - // Calculate intermediate results, including two-pixel border, for example, - // if unit size is 64x64, we calculate 68x68 pixels. + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. { const uint8_t* column = src_pre_process; uint8x8_t row[5]; uint16x8_t row_sq[5]; - - row[0] = vld1_u8(column); - column += stride; - row[1] = vld1_u8(column); - column += stride; + row[0] = row[1] = vld1_u8(column); + column += src_stride; row[2] = vld1_u8(column); - row_sq[0] = vmull_u8(row[0], row[0]); - row_sq[1] = vmull_u8(row[1], row[1]); + row_sq[0] = row_sq[1] = vmull_u8(row[1], row[1]); row_sq[2] = vmull_u8(row[2], row[2]); - int y = 0; + int y = (height + 2) >> 1; do { - column += stride; + column += src_stride; row[3] = vld1_u8(column); - column += stride; + column += src_stride; row[4] = vld1_u8(column); row_sq[3] = vmull_u8(row[3], row[3]); row_sq[4] = vmull_u8(row[4], row[4]); - PreProcess4<5, 0>(row, row_sq, s, ab_ptr); + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); row[0] = row[2]; row[1] = row[3]; @@ -1629,10 +1322,18 @@ inline void BoxFilterProcess_FirstPass( row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; ab_ptr += 8; - y += 2; - } while (y < height + 2); + } while (--y != 0); + + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = vld1_u8(column); + row_sq[3] = row_sq[4] = vmull_u8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); + } } + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0; int x = 0; do { // |src_pre_process| is X but we already processed the first column of 4 @@ -1662,21 +1363,18 @@ inline void BoxFilterProcess_FirstPass( const uint8_t* column = src_pre_process + x + 4; uint8x16_t row[5]; uint16x8x2_t row_sq[5]; - - row[0] = vld1q_u8(column); - column += stride; - row[1] = vld1q_u8(column); - column += stride; + row[0] = row[1] = vld1q_u8(column); + column += src_stride; row[2] = vld1q_u8(column); - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); - row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0])); - row_sq[0].val[1] = vmull_u8(vget_high_u8(row[0]), vget_high_u8(row[0])); - row_sq[1].val[0] = vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); - row_sq[1].val[1] = vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); + row_sq[0].val[0] = row_sq[1].val[0] = + vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); + row_sq[0].val[1] = row_sq[1].val[1] = + vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1684,24 +1382,22 @@ inline void BoxFilterProcess_FirstPass( row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); // Pass 1 Process. These are the only values we need to propagate between // rows. 
sum565_a[0] = Sum565(a2); sum565_a[0] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[0]); - sum565_b[0].val[0] = - Sum565W(vcombine_u16(vget_high_u16(b2[0]), vget_low_u16(b2[1]))); + sum565_b[0].val[0] = Sum565W(vextq_u16(b2[0], b2[1], 4)); sum565_b[0].val[1] = Sum565W(b2[1]); const uint8_t* src_ptr = src + x; - uint16_t* out_buf = box_filter_process_output + x; + uint8_t* dst_ptr = dst + x; // Calculate one output line. Add in the line from the previous pass and // output one even row. Sum the new line and output the odd row. Carry the // new row into the next pass. - int y = 0; - do { + for (int y = height >> 1; y != 0; --y) { ab_ptr += 8; b2[0] = vld1q_u16(ab_ptr); a2[0] = vget_low_u8(vreinterpretq_u8_u16(b2[0])); @@ -1714,9 +1410,9 @@ inline void BoxFilterProcess_FirstPass( row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1724,55 +1420,86 @@ inline void BoxFilterProcess_FirstPass( row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); - uint8x8_t src_u8 = vld1_u8(src_ptr); - BoxFilter1(src_u8, a2, b2, sum565_a, sum565_b, out_buf); - src_ptr += stride; - out_buf += kRestorationProcessingUnitSize; + const uint8x8_t src0 = vld1_u8(src_ptr); + const int16x8_t p0 = BoxFilterPass1(src0, a2, b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; - src_u8 = vld1_u8(src_ptr); - CalculateFilteredOutput<4>(src_u8, sum565_a[1], sum565_b[1], out_buf); - src_ptr += stride; - out_buf += kRestorationProcessingUnitSize; + const uint8x8_t src1 = vld1_u8(src_ptr); + const int16x8_t p1 = + CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); + SelfGuidedSingleMultiplier(src1, p1, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; sum565_a[0] = sum565_a[1]; sum565_b[0] = sum565_b[1]; - y += 2; - } while (y < height); + } + if ((height & 1) != 0) { + ab_ptr += 8; + b2[0] = vld1q_u16(ab_ptr); + a2[0] = vget_low_u8(vreinterpretq_u8_u16(b2[0])); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + + column += src_stride; + row[3] = row[4] = vld1q_u8(column); + + row_sq[3].val[0] = row_sq[4].val[0] = + vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); + row_sq[3].val[1] = row_sq[4].val[1] = + vmull_u8(vget_high_u8(row[3]), vget_high_u8(row[3])); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + + const uint8x8_t src0 = vld1_u8(src_ptr); + const int16x8_t p0 = BoxFilterPass1(src0, a2, b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); + } x += 8; } while (x < width); } -inline void BoxFilterProcess_SecondPass( - const uint8_t* src, const ptrdiff_t stride, const int width, - const int height, const uint32_t s, - uint16_t* const box_filter_process_output, uint16_t* const temp) { +inline void BoxFilterProcessPass2(const uint8_t* src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { 
uint16_t* ab_ptr = temp; - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. + // Calculate intermediate results, including one-pixel border, for example, if + // unit size is 64x64, we calculate 66x66 pixels. // Because of the vectors this calculates start in blocks of 4 so we actually // get 68 values. - const uint8_t* const src_top_left_corner = src - 2 - 2 * stride; + const uint8_t* const src_top_left_corner = src - 2 - 2 * src_stride; { const uint8_t* column = src_top_left_corner; uint8x8_t row[3]; uint16x8_t row_sq[3]; - row[0] = vld1_u8(column); - column += stride; + column += src_stride; row[1] = vld1_u8(column); row_sq[0] = vmull_u8(row[0], row[0]); row_sq[1] = vmull_u8(row[1], row[1]); int y = height + 2; do { - column += stride; + column += src_stride; row[2] = vld1_u8(column); row_sq[2] = vmull_u8(row[2], row[2]); - PreProcess4<3, 0>(row, row_sq, s, ab_ptr); + BoxFilterPreProcess4<3, 0>(row, row_sq, s, ab_ptr); row[0] = row[1]; row[1] = row[2]; @@ -1780,13 +1507,14 @@ inline void BoxFilterProcess_SecondPass( row_sq[0] = row_sq[1]; row_sq[1] = row_sq[2]; ab_ptr += 8; - } while (--y); + } while (--y != 0); } + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; int x = 0; do { - const uint8_t* src_ptr = src + x; - uint16_t* out_buf = box_filter_process_output + x; ab_ptr = temp; uint8x8_t a2[2]; @@ -1799,9 +1527,9 @@ inline void BoxFilterProcess_SecondPass( uint8x16_t row[3]; uint16x8x2_t row_sq[3]; row[0] = vld1q_u8(column); - column += stride; + column += src_stride; row[1] = vld1q_u8(column); - column += stride; + column += src_stride; row[2] = vld1q_u8(column); row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0])); @@ -1811,7 +1539,7 @@ inline void BoxFilterProcess_SecondPass( row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); - PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); sum343_a[0] = Sum343(a2); sum343_a[0] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[0]); @@ -1826,19 +1554,21 @@ inline void BoxFilterProcess_SecondPass( row_sq[0] = row_sq[1]; row_sq[1] = row_sq[2]; - column += stride; + column += src_stride; row[2] = vld1q_u8(column); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); - PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); Sum343_444(a2, &sum343_a[1], &sum444_a[0]); sum343_a[1] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[1]); sum444_a[0] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[0]); Sum343_444W(b2, &sum343_b[1], &sum444_b[0]); + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; int y = height; do { ab_ptr += 8; @@ -1850,214 +1580,59 @@ inline void BoxFilterProcess_SecondPass( row_sq[0] = row_sq[1]; row_sq[1] = row_sq[2]; - column += stride; + column += src_stride; row[2] = vld1q_u8(column); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); - PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], 
ab_ptr); uint8x8_t src_u8 = vld1_u8(src_ptr); - BoxFilter2(src_u8, a2, b2, sum343_a, sum444_a, sum343_b, sum444_b, - out_buf); + int16x8_t p = BoxFilterPass2(src_u8, a2, b2, sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedSingleMultiplier(src_u8, p, w0, w1, dst_ptr); sum343_a[0] = sum343_a[1]; sum343_a[1] = sum343_a[2]; sum444_a[0] = sum444_a[1]; sum343_b[0] = sum343_b[1]; sum343_b[1] = sum343_b[2]; sum444_b[0] = sum444_b[1]; - src_ptr += stride; - out_buf += kRestorationProcessingUnitSize; - } while (--y); - x += 8; - } while (x < width); -} - -inline void SelfGuidedSingleMultiplier( - const uint8_t* src, const ptrdiff_t src_stride, - const uint16_t* const box_filter_process_output, uint8_t* dst, - const ptrdiff_t dst_stride, const int width, const int height, - const int16_t w_single) { - const int16_t w_combo = (1 << kSgrProjPrecisionBits) - w_single; - const auto* box_filter = - reinterpret_cast<const int16_t*>(box_filter_process_output); - int w = width; - - if (w & 4) { - w -= 4; - const uint8_t* src_ptr = src + w; - uint8_t* dst_ptr = dst + w; - const int16_t* box_filter_w = box_filter + w; - int y = height; - do { - const int16x8_t u = vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src_ptr), kSgrProjRestoreBits)); - const int16x4_t p = vld1_s16(box_filter_w); - // u * w1 + u * wN == u * (w1 + wN) - int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo); - v_lo = vmlal_n_s16(v_lo, p, w_single); - const int16x4_t s_lo = - vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); - StoreLo4(dst_ptr, vqmovun_s16(vcombine_s16(s_lo, s_lo))); - src_ptr += src_stride; - dst_ptr += dst_stride; - box_filter_w += kRestorationProcessingUnitSize; - } while (--y); - - if (!w) return; - } - - int y = height; - do { - int x = 0; - do { - const int16x8_t u = vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits)); - const int16x8_t p = vld1q_s16(box_filter + x); - // u * w1 + u * wN == u * (w1 + wN) - int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo); - v_lo = vmlal_n_s16(v_lo, vget_low_s16(p), w_single); - int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w_combo); - v_hi = vmlal_n_s16(v_hi, vget_high_s16(p), w_single); - const int16x4_t s_lo = - vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); - const int16x4_t s_hi = - vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); - vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi))); - x += 8; - } while (x < w); - src += src_stride; - dst += dst_stride; - box_filter += kRestorationProcessingUnitSize; - } while (--y); -} - -inline void SelfGuidedDoubleMultiplier( - const uint8_t* src, const ptrdiff_t src_stride, - const uint16_t* const box_filter_process_output, uint8_t* dst, - const ptrdiff_t dst_stride, const int width, const int height, const int w0, - const int w1, const int w2) { - const auto* box_filter = - reinterpret_cast<const int16_t*>(box_filter_process_output); - const int16x4_t w0_v = vdup_n_s16(w0); - const int16x4_t w1_v = vdup_n_s16(w1); - const int16x4_t w2_v = vdup_n_s16(w2); - int w = width; - - if (w & 4) { - w -= 4; - const uint8_t* src_ptr = src + w; - uint8_t* dst_ptr = dst + w; - const int16_t* box_filter_w = box_filter + 2 * w; - int y = height; - do { - // |wN| values are signed. |src| values can be treated as int16_t. - // Load 8 values but ignore 4. - const int16x4_t u = vget_low_s16(vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src_ptr), kSgrProjRestoreBits))); - // |box_filter_process_output| is 14 bits, also safe to treat as int16_t. 
- const int16x4_t p0 = vld1_s16(box_filter_w + 0); - const int16x4_t p1 = vld1_s16(box_filter_w + 8); - int32x4_t v = vmull_s16(u, w1_v); - v = vmlal_s16(v, p0, w0_v); - v = vmlal_s16(v, p1, w2_v); - // |s| is saturated to uint8_t. - const int16x4_t s = - vrshrn_n_s32(v, kSgrProjRestoreBits + kSgrProjPrecisionBits); - StoreLo4(dst_ptr, vqmovun_s16(vcombine_s16(s, s))); src_ptr += src_stride; dst_ptr += dst_stride; - box_filter_w += 2 * kRestorationProcessingUnitSize; - } while (--y); - - if (!w) return; - } - - int y = height; - do { - int x = 0; - do { - // |wN| values are signed. |src| values can be treated as int16_t. - const int16x8_t u = vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits)); - // |box_filter_process_output| is 14 bits, also safe to treat as int16_t. - const int16x8_t p0 = vld1q_s16(box_filter + 2 * x + 0); - const int16x8_t p1 = vld1q_s16(box_filter + 2 * x + 8); - int32x4_t v_lo = vmull_s16(vget_low_s16(u), w1_v); - v_lo = vmlal_s16(v_lo, vget_low_s16(p0), w0_v); - v_lo = vmlal_s16(v_lo, vget_low_s16(p1), w2_v); - int32x4_t v_hi = vmull_s16(vget_high_s16(u), w1_v); - v_hi = vmlal_s16(v_hi, vget_high_s16(p0), w0_v); - v_hi = vmlal_s16(v_hi, vget_high_s16(p1), w2_v); - // |s| is saturated to uint8_t. - const int16x4_t s_lo = - vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); - const int16x4_t s_hi = - vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); - vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi))); - x += 8; - } while (x < w); - src += src_stride; - dst += dst_stride; - box_filter += 2 * kRestorationProcessingUnitSize; - } while (--y); + } while (--y != 0); + x += 8; + } while (x < width); } +// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in +// the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. void SelfGuidedFilter_NEON(const void* const source, void* const dest, const RestorationUnitInfo& restoration_info, - ptrdiff_t source_stride, ptrdiff_t dest_stride, - const int width, const int height, - RestorationBuffer* const /*buffer*/) { - const auto* src = static_cast<const uint8_t*>(source); - - // The output frame is broken into blocks of 64x64 (32x32 if U/V are - // subsampled). If either dimension is less than 32/64 it indicates it is at - // the right or bottom edge of the frame. It is safe to overwrite the output - // as it will not be part of the visible frame. This saves us from having to - // handle non-multiple-of-8 widths. - // We could round here, but the for loop with += 8 does the same thing. - - // width = (width + 7) & ~0x7; - - // -96 to 96 (Sgrproj_Xqd_Min/Max) + const ptrdiff_t source_stride, + const ptrdiff_t dest_stride, const int width, + const int height, RestorationBuffer* const buffer) { const int index = restoration_info.sgr_proj_info.index; - const int radius_pass_0 = kSgrProjParams[index][0]; - const int radius_pass_1 = kSgrProjParams[index][2]; - alignas(kMaxAlignment) - uint16_t box_filter_process_output[2 * kMaxBoxFilterProcessOutputPixels]; - alignas(kMaxAlignment) - uint16_t temp[12 * (kRestorationProcessingUnitSize + 2)]; - - // If |radius| is 0 then there is nothing to do. If |radius| is not 0, it is - // always 2 for the first pass and 1 for the second pass. 
- const int w0 = restoration_info.sgr_proj_info.multiplier[0]; - const int w1 = restoration_info.sgr_proj_info.multiplier[1]; - const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* src = static_cast<const uint8_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - // Note: Combining box filter process with the final multipliers has no speed - // gain. There are not enough neon registers to hold those weights. - if (radius_pass_0 != 0 && radius_pass_1 != 0) { - BoxFilterProcess(src, source_stride, width, height, - kSgrScaleParameter[index], box_filter_process_output, - temp); - SelfGuidedDoubleMultiplier(src, source_stride, box_filter_process_output, - dst, dest_stride, width, height, w0, w1, w2); + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. + assert(radius_pass_0 != 0); + BoxFilterProcessPass1(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][0], buffer->sgf_buffer, dst, + dest_stride); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][1], buffer->sgf_buffer, dst, + dest_stride); } else { - int16_t w_single; - if (radius_pass_0 != 0) { - BoxFilterProcess_FirstPass(src, source_stride, width, height, - kSgrScaleParameter[index][0], - box_filter_process_output, temp); - w_single = w0; - } else /* if (radius_pass_1 != 0) */ { - BoxFilterProcess_SecondPass(src, source_stride, width, height, - kSgrScaleParameter[index][1], - box_filter_process_output, temp); - w_single = w2; - } - SelfGuidedSingleMultiplier(src, source_stride, box_filter_process_output, - dst, dest_stride, width, height, w_single); + BoxFilterProcess(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index], buffer->sgf_buffer, dst, + dest_stride); } } diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc index b84548de6f7..3e731b22450 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc @@ -34,92 +34,77 @@ namespace libgav1 { namespace dsp { namespace { -inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) { - // Add 63 to negative delta so that it shifts towards zero. 
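// [Editorial note, not part of the upstream diff] The "+ 63" trick in
// Project_NEON is integer division by 64 that truncates towards zero:
//   delta >= 0:  delta >> 6        == trunc(delta / 64)
//   delta <  0: (delta + 63) >> 6  == trunc(delta / 64), e.g. (-1 + 63) >> 6 == 0
// vsraq_n_u16(delta_u, delta_sign_u, 10) adds exactly that 63 only to the
// negative lanes, because the all-ones sign mask shifted right by 10 as an
// unsigned 16-bit value is 63.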
- const int16x8_t delta_sign = vshrq_n_s16(delta, 15); - const uint16x8_t delta_u = vreinterpretq_u16_s16(delta); - const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign); - const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10); - const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u); - const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6); - const int16x8_t offset1 = veorq_s16(offset0, dst_sign); - const int16x8_t offset2 = vsubq_s16(offset1, dst_sign); - return vqmovn_s16(offset2); -} - -inline int16x8_t LookupTable(const int8x8x4_t division_table, - const int8x16_t idx) { - const int8x8_t idx_low = vget_low_s8(idx); - const int8x8_t idx_high = vget_high_s8(idx); - const int16x4_t d0 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_low)); - const int16x4_t d1 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_high)); - return vcombine_s16(d0, d1); -} - -inline int16x8_t LoadDivision(const int8x8x4_t division_table[2], +inline int16x8_t LoadDivision(const int8x8x2_t division_table, const int8x8_t reference_offset) { - const int8x16_t k32 = vdupq_n_s8(32); const int8x8_t kOne = vcreate_s8(0x0100010001000100); const int8x16_t kOneQ = vcombine_s8(kOne, kOne); const int8x8_t t = vadd_s8(reference_offset, reference_offset); const int8x8x2_t tt = vzip_s8(t, t); const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]); - const int8x16_t idx0 = vaddq_s8(t1, kOneQ); - const int8x16_t idx1 = vsubq_s8(idx0, k32); - const int16x8_t denorm0 = LookupTable(division_table[0], idx0); - const int16x8_t denorm1 = LookupTable(division_table[1], idx1); - return vorrq_s16(denorm0, denorm1); + const int8x16_t idx = vaddq_s8(t1, kOneQ); + const int8x8_t idx_low = vget_low_s8(idx); + const int8x8_t idx_high = vget_high_s8(idx); + const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low)); + const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high)); + return vcombine_s16(d0, d1); } inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator, const int numerator) { const int32x4_t m0 = vmull_s16(mv, denominator); const int32x4_t m = vmulq_n_s32(m0, numerator); - // Subtract the sign bit to round towards zero. - const int32x4_t sub_sign = vsraq_n_s32(m, m, 31); - return vqrshrn_n_s32(sub_sign, 14); + // Add the sign (0 or -1) to round towards zero. 
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31); + return vqrshrn_n_s32(add_sign, 14); } inline int16x8_t MvProjectionClip(const int16x8_t mv, const int16x8_t denominator, const int numerator) { - const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp); const int16x4_t mv0 = vget_low_s16(mv); const int16x4_t mv1 = vget_high_s16(mv); - const int16x4_t m0 = MvProjection(mv0, vget_low_s16(denominator), numerator); - const int16x4_t m1 = MvProjection(mv1, vget_high_s16(denominator), numerator); - const int16x8_t m = vcombine_s16(m0, m1); - const int16x8_t clamp = vminq_s16(m, projection_mv_clamp); + const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator); + const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator); + const int16x8_t projection = vcombine_s16(s0, s1); + const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp); + const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp); return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp)); } -inline void GetMvProjection(const int32x4_t mv[2], const int16x8_t denominator, - const int numerator, int16x8_t projection_mv[2]) { - const int16x8_t mv0 = vreinterpretq_s16_s32(mv[0]); - const int16x8_t mv1 = vreinterpretq_s16_s32(mv[1]); - // Deinterlace - const int16x8x2_t mvs = vuzpq_s16(mv0, mv1); - projection_mv[0] = MvProjectionClip(mvs.val[0], denominator, numerator); - projection_mv[1] = MvProjectionClip(mvs.val[1], denominator, numerator); +inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) { + // Add 63 to negative delta so that it shifts towards zero. + const int16x8_t delta_sign = vshrq_n_s16(delta, 15); + const uint16x8_t delta_u = vreinterpretq_u16_s16(delta); + const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign); + const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10); + const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u); + const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6); + const int16x8_t offset1 = veorq_s16(offset0, dst_sign); + const int16x8_t offset2 = vsubq_s16(offset1, dst_sign); + return vqmovn_s16(offset2); } -void GetPosition(const int8x8x4_t division_table[2], - const MotionVector* const mv, - const int reference_to_current_with_sign, const int x8_start, - const int x8_end, const int x8, const int8x8_t r_offsets, - const int8x8_t source_reference_type8, const int8x8_t skip_r, - const int8x8_t y8_floor8, const int8x8_t y8_ceiling8, - const int16x8_t d_sign, const int delta, int8x8_t* const r, - int8x8_t* const position_y8, int8x8_t* const position_x8, - int64_t* const skip_64, int32x4_t mvs[2]) { - const int32_t* const mv_int = reinterpret_cast<const int32_t*>(mv + x8); +inline void GetPosition( + const int8x8x2_t division_table, const MotionVector* const mv, + const int numerator, const int x8_start, const int x8_end, const int x8, + const int8x8_t r_offsets, const int8x8_t source_reference_type8, + const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8, + const int16x8_t d_sign, const int delta, int8x8_t* const r, + int8x8_t* const position_y8, int8x8_t* const position_x8, + int64_t* const skip_64, int32x4_t mvs[2]) { + const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8); *r = vtbl1_s8(r_offsets, source_reference_type8); - const int16x8_t denorm = LoadDivision(division_table, *r); + const int16x8_t denorm = LoadDivision(division_table, source_reference_type8); int16x8_t projection_mv[2]; mvs[0] = vld1q_s32(mv_int + 0); mvs[1] 
= vld1q_s32(mv_int + 4); - // reference_to_current_with_sign could be 0. - GetMvProjection(mvs, denorm, reference_to_current_with_sign, projection_mv); + // Deinterlace x and y components + const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]); + const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]); + const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1); + // numerator could be 0. + projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator); + projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator); // Do not update the motion vector if the block position is not valid or // if position_x8 is outside the current range of x8_start and x8_end. // Note that position_y8 will always be within the range of y8_start and @@ -147,46 +132,31 @@ void GetPosition(const int8x8x4_t division_table[2], } template <int idx> -int16_t VgetqLaneS16(const int16x8_t src) { - if (idx == 0) return vgetq_lane_s16(src, 0); - if (idx == 1) return vgetq_lane_s16(src, 1); - if (idx == 2) return vgetq_lane_s16(src, 2); - if (idx == 3) return vgetq_lane_s16(src, 3); - if (idx == 4) return vgetq_lane_s16(src, 4); - if (idx == 5) return vgetq_lane_s16(src, 5); - if (idx == 6) return vgetq_lane_s16(src, 6); - return vgetq_lane_s16(src, 7); -} - -template <int idx> inline void Store(const int16x8_t position, const int8x8_t reference_offset, - const int32x4_t mvs, int8_t* dst_reference_offset, + const int32x4_t mv, int8_t* dst_reference_offset, MotionVector* dst_mv) { - const ptrdiff_t offset = VgetqLaneS16<idx>(position); - int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]); - vst1q_lane_s32(d_mv, mvs, idx & 3); + const ptrdiff_t offset = vgetq_lane_s16(position, idx); + auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]); + vst1q_lane_s32(d_mv, mv, idx & 3); vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx); } template <int idx> inline void CheckStore(const int8_t* skips, const int16x8_t position, - const int8x8_t reference_offset, const int32x4_t mvs, + const int8x8_t reference_offset, const int32x4_t mv, int8_t* dst_reference_offset, MotionVector* dst_mv) { if (skips[idx] == 0) { - const ptrdiff_t offset = VgetqLaneS16<idx>(position); - int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]); - vst1q_lane_s32(d_mv, mvs, idx & 3); - vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx); + Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv); } } // 7.9.2. 
-void MotionFieldProjectionKernel_NEON( - const ReferenceFrameType* source_reference_type, const MotionVector* mv, - const uint8_t order_hint[kNumReferenceFrameTypes], - unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* motion_field) { +void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info, + const int reference_to_current_with_sign, + const int dst_sign, const int y8_start, + const int y8_end, const int x8_start, + const int x8_end, + TemporalMotionField* const motion_field) { const ptrdiff_t stride = motion_field->mv.columns(); // The column range has to be offset by kProjectionMvMaxHorizontalOffset since // coordinates in that range could end up being position_x8 because of @@ -197,14 +167,17 @@ void MotionFieldProjectionKernel_NEON( x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride)); const int adjusted_x8_end8 = adjusted_x8_end & ~7; const int leftover = adjusted_x8_end - adjusted_x8_end8; - const int8_t* const table = - reinterpret_cast<const int8_t*>(kProjectionMvDivisionLookup); + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; MotionVector* dst_mv = motion_field->mv[y8_start]; const int16x8_t d_sign = vdupq_n_s16(dst_sign); - int8_t reference_offsets[kNumReferenceFrameTypes]; - bool skip_reference[kNumReferenceFrameTypes]; - int8x8x4_t division_table[2]; static_assert(sizeof(int8_t) == sizeof(bool), ""); static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), ""); @@ -219,37 +192,13 @@ void MotionFieldProjectionKernel_NEON( // which means this optimization works for frame width up to 32K (each // position is a 8x8 block). assert(8 * stride <= 32768); - - const int8x8_t current_order_hints = vdup_n_s8(current_frame_order_hint); - const int8x8_t order_hints = vreinterpret_s8_u8(vld1_u8(order_hint)); - const int8x8_t diff = vsub_s8(current_order_hints, order_hints); - // |order_hint_shift_bits| - 24 could be -24. In this case diff is 0, - // and the behavior of left or right shifting -24 bits is defined for ARM NEON - // instructions, and the result of shifting 0 is still 0. - const int8x8_t left_shift_bits = vdup_n_s8(order_hint_shift_bits - 24); - const int8x8_t diff_shift_left = vshl_s8(diff, left_shift_bits); - const int8x8_t r_offsets = vshl_s8(diff_shift_left, vneg_s8(left_shift_bits)); - const uint8x8_t overflow = vcgt_s8(r_offsets, vdup_n_s8(kMaxFrameDistance)); - const uint8x8_t underflow = vcle_s8(r_offsets, vdup_n_s8(0)); - const int8x8_t sk = vreinterpret_s8_u8(vorr_u8(overflow, underflow)); - // Initialize skip_reference[kReferenceFrameIntra] to simplify branch - // conditions in projection. - const int8x8_t skip_reference8 = vset_lane_s8(-1, sk, 0); - vst1_s8(reinterpret_cast<int8_t*>(skip_reference), skip_reference8); - vst1_s8(reference_offsets, r_offsets); - - // The compiler is inefficient when using vld4_s64(). 
Instructions waste in - // copying from int64x1x4_t to int8x8x4_t, and there is no such vector - // reinterpret intrinsics available to the best of our knowledge. Anyway - // compiler is good enough to use 4 vld1q_s8(). - division_table[0].val[0] = vld1_s8(table + 0 * 8); - division_table[0].val[1] = vld1_s8(table + 1 * 8); - division_table[0].val[2] = vld1_s8(table + 2 * 8); - division_table[0].val[3] = vld1_s8(table + 3 * 8); - division_table[1].val[0] = vld1_s8(table + 4 * 8); - division_table[1].val[1] = vld1_s8(table + 5 * 8); - division_table[1].val[2] = vld1_s8(table + 6 * 8); - division_table[1].val[3] = vld1_s8(table + 7 * 8); + const int8x8_t skip_reference = + vld1_s8(reinterpret_cast<const int8_t*>(skip_references)); + const int8x8_t r_offsets = vld1_s8(reference_offsets); + const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions)); + int8x8x2_t division_table; + division_table.val[0] = vget_low_s8(table); + division_table.val[1] = vget_high_s8(table); int y8 = y8_start; do { @@ -261,8 +210,8 @@ void MotionFieldProjectionKernel_NEON( for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) { const int8x8_t source_reference_type8 = - vld1_s8(reinterpret_cast<const int8_t*>(source_reference_type + x8)); - const int8x8_t skip_r = vtbl1_s8(skip_reference8, source_reference_type8); + vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8)); + const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8); const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0); // Early termination #1 if all are skips. Chance is typically ~30-40%. if (early_skip == -1) continue; @@ -278,8 +227,8 @@ void MotionFieldProjectionKernel_NEON( if (skip_64 == -1) continue; const int16x8_t p_y = vmovl_s8(position_y8); const int16x8_t p_x = vmovl_s8(position_x8); - const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride); - const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8)); + const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride); + const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8)); if (skip_64 == 0) { // Store all. Chance is typically ~70-85% after Early termination #2. Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv); @@ -318,9 +267,9 @@ void MotionFieldProjectionKernel_NEON( const int delta = 8 - leftover; x8 = adjusted_x8_end - 8; const int8x8_t source_reference_type8 = vld1_s8( - reinterpret_cast<const int8_t*>(source_reference_type + x8)); + reinterpret_cast<const int8_t*>(source_reference_types + x8)); const int8x8_t skip_r = - vtbl1_s8(skip_reference8, source_reference_type8); + vtbl1_s8(skip_reference, source_reference_type8); const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0); // Early termination #1 if all are skips. @@ -336,8 +285,8 @@ void MotionFieldProjectionKernel_NEON( if (skip_64 != -1) { const int16x8_t p_y = vmovl_s8(position_y8); const int16x8_t p_x = vmovl_s8(position_x8); - const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride); - const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8)); + const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride); + const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8)); // Store up to 7 elements since leftover is at most 7. if (skip_64 == 0) { // Store all. 
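// Illustrative scalar sketch (not from the diff) of what the MvProjection()/
// MvProjectionClip() helpers earlier in this file compute. The function name,
// the |division| parameter name, and the value of kProjectionMvClamp
// ((1 << 14) - 1) are assumptions for illustration; the intrinsic-to-scalar
// mapping is noted inline.
#include <algorithm>
#include <cstdint>

int16_t MvProjectionScalar(int16_t mv, int16_t division, int numerator) {
  // vmull_s16() then vmulq_n_s32(): widen to 32 bits and multiply. Assumes
  // the product fits in 32 bits, as the vector code does.
  int32_t m = static_cast<int32_t>(mv) * division * numerator;
  // vsraq_n_s32(m, m, 31): add 0 for non-negative m and -1 for negative m,
  // so that, combined with the rounding shift below, negative values round
  // symmetrically with positive ones instead of being biased upward at the
  // halfway points.
  m += m >> 31;
  // vqrshrn_n_s32(..., 14): rounding right shift by 14.
  const int32_t rounded = (m + (1 << 13)) >> 14;
  // vminq_s16()/vmaxq_s16(): clamp to +/-kProjectionMvClamp.
  constexpr int32_t kProjectionMvClamp = (1 << 14) - 1;  // assumed value
  return static_cast<int16_t>(
      std::min(std::max(rounded, -kProjectionMvClamp), kProjectionMvClamp));
}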
@@ -373,13 +322,13 @@ void MotionFieldProjectionKernel_NEON( } } else { for (; x8 < adjusted_x8_end; ++x8) { - if (skip_reference[source_reference_type[x8]]) continue; - const int reference_offset = - reference_offsets[source_reference_type[x8]]; + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; MotionVector projection_mv; // reference_to_current_with_sign could be 0. GetMvProjection(mv[x8], reference_to_current_with_sign, - reference_offset, &projection_mv); + projection_divisions[source_reference_type], + &projection_mv); // Do not update the motion vector if the block position is not valid // or if position_x8 is outside the current range of x8_start and // x8_end. Note that position_y8 will always be within the range of @@ -395,12 +344,12 @@ void MotionFieldProjectionKernel_NEON( if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; dst_mv[position_y8 * stride + position_x8] = mv[x8]; dst_reference_offset[position_y8 * stride + position_x8] = - reference_offset; + reference_offsets[source_reference_type]; } } } - source_reference_type += stride; + source_reference_types += stride; mv += stride; dst_reference_offset += stride; dst_mv += stride; diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc index 5332180dfbc..da3ba1706e6 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc @@ -64,7 +64,7 @@ inline int16x8_t MvProjectionCompoundClip( const MotionVector* const temporal_mvs, const int8_t* const temporal_reference_offsets, const int reference_offsets[2]) { - const int32_t* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); + const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); const int32x2_t temporal_mv = vld1_s32(tmvs); const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0)); const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1)); @@ -79,7 +79,7 @@ inline int16x8_t MvProjectionSingleClip( const MotionVector* const temporal_mvs, const int8_t* const temporal_reference_offsets, const int reference_offset, int16x4_t* const lookup) { - const int16_t* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs); + const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs); const int16x8_t temporal_mv = vld1q_s16(tmvs); *lookup = vld1_lane_s16( &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0); @@ -98,27 +98,26 @@ inline int16x8_t MvProjectionSingleClip( return ProjectionClip(mv0, mv1); } -void LowPrecision(const int16x8_t mv, void* const candidate_mvs) { - const int16x8_t k1 = vdupq_n_s16(1); +inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) { + const int16x8_t kRoundDownMask = vdupq_n_s16(1); const uint16x8_t mvu = vreinterpretq_u16_s16(mv); const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15)); - const int16x8_t mv1 = vbicq_s16(mv0, k1); + const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask); vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1); } -void ForceInteger(const int16x8_t mv, void* const candidate_mvs) { - const int16x8_t k3 = vdupq_n_s16(3); - const int16x8_t k7 = vdupq_n_s16(7); +inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) { + const int16x8_t kRoundDownMask = vdupq_n_s16(7); const uint16x8_t mvu = 
vreinterpretq_u16_s16(mv); const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15)); - const int16x8_t mv1 = vaddq_s16(mv0, k3); - const int16x8_t mv2 = vbicq_s16(mv1, k7); + const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3)); + const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask); vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2); } void MvProjectionCompoundLowPrecision_NEON( const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, - const int reference_offsets[2], int count, + const int reference_offsets[2], const int count, CompoundMotionVector* candidate_mvs) { // |reference_offsets| non-zero check usually equals true and is ignored. // To facilitate the compilers, make a local copy of |reference_offsets|. diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc index 901aa3ddedf..c7fb739ba75 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc @@ -133,7 +133,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride, assert(block_width >= 8); assert(block_height >= 8); - // Warp process applies for each 8x8 block (or smaller). + // Warp process applies for each 8x8 block. int start_y = block_start_y; do { int start_x = block_start_x; diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h b/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h index f13eb13605c..b4749ec6aea 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h +++ b/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h @@ -36,6 +36,7 @@ void WeightMaskInit_NEON(); #define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON diff --git a/chromium/third_party/libgav1/src/src/dsp/cdef.cc b/chromium/third_party/libgav1/src/src/dsp/cdef.cc index 0ebee20d8b5..a7c720b77cc 100644 --- a/chromium/third_party/libgav1/src/src/dsp/cdef.cc +++ b/chromium/third_party/libgav1/src/src/dsp/cdef.cc @@ -29,6 +29,8 @@ namespace libgav1 { namespace dsp { namespace { +#include "src/dsp/cdef.inc" + // Silence unused function warnings when CdefDirection_C is obviated. #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \ @@ -119,21 +121,23 @@ int Constrain(int diff, int threshold, int damping) { // constant large value if at the boundary. And the input should be uint16_t. 
template <int bitdepth, typename Pixel> void CdefFilter_C(const void* const source, const ptrdiff_t source_stride, - const int rows4x4, const int columns4x4, const int curr_x, - const int curr_y, const int subsampling_x, - const int subsampling_y, const int primary_strength, - const int secondary_strength, const int damping, - const int direction, void* const dest, + const int block_width, const int block_height, + const int primary_strength, const int secondary_strength, + const int damping, const int direction, void* const dest, const ptrdiff_t dest_stride) { - static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0, - kCdefSecondaryTap1}; - const int coeff_shift = bitdepth - 8; - const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x; - const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y; - const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x); assert(block_width == 4 || block_width == 8); - const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y); assert(block_height == 4 || block_height == 8); + assert(direction >= 0 && direction <= 7); + constexpr int coeff_shift = bitdepth - 8; + // Section 5.9.19. CDEF params syntax. + assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift); + assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift && + secondary_strength != 3 << coeff_shift); + // damping is decreased by 1 for chroma. + assert((damping >= 3 && damping <= 6 + coeff_shift) || + (damping >= 2 && damping <= 5 + coeff_shift)); + static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0, + kCdefSecondaryTap1}; const auto* src = static_cast<const uint16_t*>(source); auto* dst = static_cast<Pixel*>(dest); const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel); @@ -146,7 +150,7 @@ void CdefFilter_C(const void* const source, const ptrdiff_t source_stride, uint16_t max_value = pixel_value; uint16_t min_value = pixel_value; for (int k = 0; k < 2; ++k) { - const int signs[] = {-1, 1}; + static constexpr int signs[] = {-1, 1}; for (const int& sign : signs) { int dy = sign * kCdefDirections[direction][k][0]; int dx = sign * kCdefDirections[direction][k][1]; @@ -160,10 +164,10 @@ void CdefFilter_C(const void* const source, const ptrdiff_t source_stride, max_value = std::max(value, max_value); min_value = std::min(value, min_value); } - const int offsets[] = {-2, 2}; + static constexpr int offsets[] = {-2, 2}; for (const int& offset : offsets) { - dy = sign * kCdefDirections[(direction + offset) & 7][k][0]; - dx = sign * kCdefDirections[(direction + offset) & 7][k][1]; + dy = sign * kCdefDirections[direction + offset][k][0]; + dx = sign * kCdefDirections[direction + offset][k][1]; value = src[dy * source_stride + dx + x]; // Note: the summation can ignore the condition check in SIMD // implementation. diff --git a/chromium/third_party/libgav1/src/src/dsp/cdef.inc b/chromium/third_party/libgav1/src/src/dsp/cdef.inc new file mode 100644 index 00000000000..c1a31361796 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/cdef.inc @@ -0,0 +1,29 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Constants used for cdef implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. + +const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2; + +// Mirror values and pad to 16 elements. +alignas(16) constexpr uint32_t kCdefDivisionTable[] = { + 840, 420, 280, 210, 168, 140, 120, 105, + 120, 140, 168, 210, 280, 420, 840, 0}; + +// Used when calculating odd |cost[x]| values to mask off unwanted elements. +// Holds elements 1 3 5 X 5 3 1 X +alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0, + 140, 210, 420, 0}; diff --git a/chromium/third_party/libgav1/src/src/dsp/common.h b/chromium/third_party/libgav1/src/src/dsp/common.h index 2532d177856..2a08403379f 100644 --- a/chromium/third_party/libgav1/src/src/dsp/common.h +++ b/chromium/third_party/libgav1/src/src/dsp/common.h @@ -45,15 +45,15 @@ struct RestorationUnitInfo : public MaxAlignedAllocable { WienerInfo wiener_info; }; -struct RestorationBuffer { +union RestorationBuffer { // For self-guided filter. - int* box_filter_process_output[2]; - ptrdiff_t box_filter_process_output_stride; - uint32_t* box_filter_process_intermediate[2]; - ptrdiff_t box_filter_process_intermediate_stride; + alignas(kMaxAlignment) uint16_t sgf_buffer[12 * (kRestorationUnitHeight + 2)]; // For wiener filter. - uint16_t* wiener_buffer; - ptrdiff_t wiener_buffer_stride; + // The array |intermediate| in Section 7.17.4, the intermediate results + // between the horizontal and vertical filters. + alignas(kMaxAlignment) uint16_t + wiener_buffer[(kRestorationUnitHeight + kSubPixelTaps - 1) * + kRestorationUnitWidth]; }; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/constants.cc b/chromium/third_party/libgav1/src/src/dsp/constants.cc index 1b9e6fc14e0..0099ca36c8c 100644 --- a/chromium/third_party/libgav1/src/src/dsp/constants.cc +++ b/chromium/third_party/libgav1/src/src/dsp/constants.cc @@ -81,8 +81,23 @@ const uint16_t kSgrScaleParameter[16][2] = { const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}}; -const int8_t kCdefDirections[8][2][2] = { - {{-1, 1}, {-2, 2}}, {{0, 1}, {-1, 2}}, {{0, 1}, {0, 2}}, {{0, 1}, {1, 2}}, - {{1, 1}, {2, 2}}, {{1, 0}, {2, 1}}, {{1, 0}, {2, 0}}, {{1, 0}, {2, -1}}}; +// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the +// beginning and end of the table. The cdef direction range is [0, 7] and the +// first index is offset +/-2. This removes the need to constrain the first +// index to the same range using e.g., & 7. 
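// Illustrative check (not from the diff) of the padding scheme described in
// the comment above and the table added just below: with kCdefDirections
// aliased to kCdefDirectionsPadded + 2 (as in cdef.inc), indexing with
// direction + offset for direction in [0, 7] and offset in {-2, 2} selects
// exactly the taps the removed 8-entry table selected via
// (direction + offset) & 7.
#include <cassert>
#include <cstdint>
#include <initializer_list>

extern const int8_t kCdefDirectionsPadded[12][2][2];

void VerifyCdefDirectionPadding() {
  // The old table, as removed above.
  static const int8_t kOld[8][2][2] = {
      {{-1, 1}, {-2, 2}}, {{0, 1}, {-1, 2}}, {{0, 1}, {0, 2}},
      {{0, 1}, {1, 2}},   {{1, 1}, {2, 2}},  {{1, 0}, {2, 1}},
      {{1, 0}, {2, 0}},   {{1, 0}, {2, -1}}};
  const int8_t(*const directions)[2][2] = kCdefDirectionsPadded + 2;
  for (int direction = 0; direction < 8; ++direction) {
    for (const int offset : {-2, 2}) {
      for (int k = 0; k < 2; ++k) {
        assert(directions[direction + offset][k][0] ==
               kOld[(direction + offset) & 7][k][0]);
        assert(directions[direction + offset][k][1] ==
               kOld[(direction + offset) & 7][k][1]);
      }
    }
  }
}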
+const int8_t kCdefDirectionsPadded[12][2][2] = { + {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6] + {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7] + {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions + {{0, 1}, {-1, 2}}, // + {{0, 1}, {0, 2}}, // + {{0, 1}, {1, 2}}, // + {{1, 1}, {2, 2}}, // + {{1, 0}, {2, 1}}, // + {{1, 0}, {2, 0}}, // + {{1, 0}, {2, -1}}, // End Cdef_Directions + {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0] + {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1] +}; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/constants.h b/chromium/third_party/libgav1/src/src/dsp/constants.h index d588d22af41..7c1b62c4926 100644 --- a/chromium/third_party/libgav1/src/src/dsp/constants.h +++ b/chromium/third_party/libgav1/src/src/dsp/constants.h @@ -64,7 +64,7 @@ extern const uint16_t kSgrScaleParameter[16][2]; extern const uint8_t kCdefPrimaryTaps[2][2]; -extern const int8_t kCdefDirections[8][2][2]; +extern const int8_t kCdefDirectionsPadded[12][2][2]; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/dsp.cc b/chromium/third_party/libgav1/src/src/dsp/dsp.cc index db285a5f8a0..c1df27634cc 100644 --- a/chromium/third_party/libgav1/src/src/dsp/dsp.cc +++ b/chromium/third_party/libgav1/src/src/dsp/dsp.cc @@ -94,6 +94,8 @@ void DspInit() { LoopFilterInit_SSE4_1(); LoopRestorationInit_SSE4_1(); MaskBlendInit_SSE4_1(); + MotionFieldProjectionInit_SSE4_1(); + MotionVectorSearchInit_SSE4_1(); ObmcInit_SSE4_1(); SuperResInit_SSE4_1(); WarpInit_SSE4_1(); diff --git a/chromium/third_party/libgav1/src/src/dsp/dsp.h b/chromium/third_party/libgav1/src/src/dsp/dsp.h index f5b5b366947..470436faf26 100644 --- a/chromium/third_party/libgav1/src/src/dsp/dsp.h +++ b/chromium/third_party/libgav1/src/src/dsp/dsp.h @@ -25,6 +25,7 @@ #include "src/dsp/constants.h" #include "src/dsp/film_grain_common.h" #include "src/utils/cpu.h" +#include "src/utils/reference_info.h" #include "src/utils/types.h" namespace libgav1 { @@ -328,20 +329,15 @@ using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride, // Cdef filtering function signature. Section 7.15.3. // |source| is a pointer to the input block. |source_stride| is given in bytes. -// |rows4x4| and |columns4x4| are frame sizes in units of 4x4 pixels. -// |curr_x| and |curr_y| are current position in units of pixels. -// |subsampling_x|, |subsampling_y| are the subsampling factors of current -// plane. +// |block_width|, |block_height| are the width/height of the input block. // |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering // parameters. // |direction| is the filtering direction. // |dest| is the output buffer. |dest_stride| is given in bytes. using CdefFilteringFunc = void (*)(const void* source, ptrdiff_t source_stride, - int rows4x4, int columns4x4, int curr_x, - int curr_y, int subsampling_x, - int subsampling_y, int primary_strength, - int secondary_strength, int damping, - int direction, void* dest, + int block_width, int block_height, + int primary_strength, int secondary_strength, + int damping, int direction, void* dest, ptrdiff_t dest_stride); // Upscaling process function signature. Section 7.16. @@ -360,7 +356,8 @@ using SuperResRowFunc = void (*)(const void* source, const int upscaled_width, // |source| is the input frame buffer, which is deblocked and cdef filtered. // |dest| is the output. // |restoration_info| contains loop restoration information, such as filter -// type, strength. |source| and |dest| share the same stride given in bytes. 
+// type, strength. +// |source_stride| and |dest_stride| are given in pixels. // |buffer| contains buffers required for self guided filter and wiener filter. // They must be initialized before calling. using LoopRestorationFunc = void (*)( @@ -745,15 +742,7 @@ struct FilmGrainFuncs { }; // Motion field projection function signature. Section 7.9. -// |source_reference_type| corresponds to MfRefFrames[i * 2 + 1][j * 2 + 1] in -// the spec. -// |mv| corresponds to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. -// |order_hint| points to an array of kNumReferenceFrameTypes elements which -// specifies OrderHintBits least significant bits of the expected output order -// for reference frames. -// |current_frame_order_hint| specifies OrderHintBits least significant bits of -// the expected output order for this frame. -// |order_hint_shift_bits| equals (32 - OrderHintBits) % 32. +// |reference_info| provides reference information for motion field projection. // |reference_to_current_with_sign| is the precalculated reference frame id // distance from current frame. // |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others. @@ -763,11 +752,9 @@ struct FilmGrainFuncs { // |motion_field| is the output which saves the projected motion field // information. using MotionFieldProjectionKernelFunc = void (*)( - const ReferenceFrameType* source_reference_type, const MotionVector* mv, - const uint8_t order_hint[kNumReferenceFrameTypes], - unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* motion_field); + const ReferenceInfo& reference_info, int reference_to_current_with_sign, + int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end, + TemporalMotionField* motion_field); // Compound temporal motion vector projection function signature. // Section 7.9.3 and 7.10.2.10. 
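// Hypothetical call site (not from the diff) for the reworked kernel
// signature above: the per-reference order hints now travel inside
// ReferenceInfo rather than as separate arrays. GetDspTable() is assumed to
// be libgav1's existing per-bitdepth accessor for the Dsp table declared in
// this header, and DspInit() (see dsp.cc) must have run first.
void ProjectMotionField(const ReferenceInfo& reference_info,
                        const int reference_to_current_with_sign,
                        const int dst_sign, const int y8_start,
                        const int y8_end, const int x8_start, const int x8_end,
                        TemporalMotionField* const motion_field) {
  const dsp::Dsp* const d = dsp::GetDspTable(8);
  d->motion_field_projection_kernel(reference_info,
                                    reference_to_current_with_sign, dst_sign,
                                    y8_start, y8_end, x8_start, x8_end,
                                    motion_field);
}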
@@ -797,35 +784,35 @@ using MvProjectionSingleFunc = void (*)( int reference_offset, int count, MotionVector* candidate_mvs); struct Dsp { - IntraPredictorFuncs intra_predictors; + AverageBlendFunc average_blend; + CdefDirectionFunc cdef_direction; + CdefFilteringFunc cdef_filter; + CflIntraPredictorFuncs cfl_intra_predictors; + CflSubsamplerFuncs cfl_subsamplers; + ConvolveFuncs convolve; + ConvolveScaleFuncs convolve_scale; DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1; DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2; DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3; + DistanceWeightedBlendFunc distance_weighted_blend; + FilmGrainFuncs film_grain; FilterIntraPredictorFunc filter_intra_predictor; - CflIntraPredictorFuncs cfl_intra_predictors; - CflSubsamplerFuncs cfl_subsamplers; + InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp; IntraEdgeFilterFunc intra_edge_filter; IntraEdgeUpsamplerFunc intra_edge_upsampler; + IntraPredictorFuncs intra_predictors; InverseTransformAddFuncs inverse_transforms; LoopFilterFuncs loop_filters; - CdefDirectionFunc cdef_direction; - CdefFilteringFunc cdef_filter; - SuperResRowFunc super_res_row; LoopRestorationFuncs loop_restorations; + MaskBlendFuncs mask_blend; MotionFieldProjectionKernelFunc motion_field_projection_kernel; MvProjectionCompoundFunc mv_projection_compound[3]; MvProjectionSingleFunc mv_projection_single[3]; - ConvolveFuncs convolve; - ConvolveScaleFuncs convolve_scale; - WeightMaskFuncs weight_mask; - AverageBlendFunc average_blend; - DistanceWeightedBlendFunc distance_weighted_blend; - MaskBlendFuncs mask_blend; - InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp; ObmcBlendFuncs obmc_blend; - WarpFunc warp; + SuperResRowFunc super_res_row; WarpCompoundFunc warp_compound; - FilmGrainFuncs film_grain; + WarpFunc warp; + WeightMaskFuncs weight_mask; }; // Initializes function pointers based on build config and runtime diff --git a/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake b/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake index 06e23ee0f4f..00574fa1953 100644 --- a/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake +++ b/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake @@ -24,6 +24,7 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/average_blend.h" "${libgav1_source}/dsp/cdef.cc" "${libgav1_source}/dsp/cdef.h" + "${libgav1_source}/dsp/cdef.inc" "${libgav1_source}/dsp/common.h" "${libgav1_source}/dsp/constants.cc" "${libgav1_source}/dsp/constants.h" @@ -42,6 +43,7 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/intrapred.h" "${libgav1_source}/dsp/inverse_transform.cc" "${libgav1_source}/dsp/inverse_transform.h" + "${libgav1_source}/dsp/inverse_transform.inc" "${libgav1_source}/dsp/loop_filter.cc" "${libgav1_source}/dsp/loop_filter.h" "${libgav1_source}/dsp/loop_restoration.cc" @@ -54,6 +56,7 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/motion_vector_search.h" "${libgav1_source}/dsp/obmc.cc" "${libgav1_source}/dsp/obmc.h" + "${libgav1_source}/dsp/obmc.inc" "${libgav1_source}/dsp/super_res.cc" "${libgav1_source}/dsp/super_res.h" "${libgav1_source}/dsp/warp.cc" @@ -128,6 +131,10 @@ list(APPEND libgav1_dsp_sources_sse4 "${libgav1_source}/dsp/x86/loop_restoration_sse4.h" "${libgav1_source}/dsp/x86/mask_blend_sse4.cc" "${libgav1_source}/dsp/x86/mask_blend_sse4.h" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h" + 
"${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc" + "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h" "${libgav1_source}/dsp/x86/obmc_sse4.cc" "${libgav1_source}/dsp/x86/obmc_sse4.h" "${libgav1_source}/dsp/x86/super_res_sse4.cc" diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc b/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc index 946952b029c..6cad97d4280 100644 --- a/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc +++ b/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc @@ -31,10 +31,10 @@ template <int bitdepth, typename Pixel> struct LoopFilterFuncs_C { LoopFilterFuncs_C() = delete; - static const int kMaxPixel = (1 << bitdepth) - 1; - static const int kMinSignedPixel = -(1 << (bitdepth - 1)); - static const int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1; - static const int kFlatThresh = 1 << (bitdepth - 8); + static constexpr int kMaxPixel = (1 << bitdepth) - 1; + static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1)); + static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1; + static constexpr int kFlatThresh = 1 << (bitdepth - 8); static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, int hev_thresh); diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc index 467e33492fd..b2ae99c0882 100644 --- a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc +++ b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc @@ -26,15 +26,6 @@ namespace libgav1 { namespace dsp { -namespace { - -// Precision of a division table (mtable) -constexpr int kSgrProjScaleBits = 20; -constexpr int kSgrProjReciprocalBits = 12; -// Core self-guided restoration precision bits. -constexpr int kSgrProjSgrBits = 8; -// Precision bits of generated values higher than source before projection. -constexpr int kSgrProjRestoreBits = 4; // Section 7.17.3. // a2: range [1, 256]. 
@@ -44,7 +35,7 @@ constexpr int kSgrProjRestoreBits = 4; // a2 = 1; // else // a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); -constexpr int kXByXPlus1[256] = { +const int kXByXPlus1[256] = { 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, @@ -64,65 +55,51 @@ constexpr int kXByXPlus1[256] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256}; +// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); +// sgr_ma2 = 256 - a2 +const uint8_t kSgrMa2Lookup[256] = { + 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14, + 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, + 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0}; + +namespace { + +constexpr ptrdiff_t kIntermediateStride = kRestorationUnitWidth + 2; + +struct SgrIntermediateBuffer { + uint16_t a; // [1, 256] + uint32_t b; // < 2^20. 32-bit is required for bitdepth 10 and up. +}; + +struct SgrBuffer { + // Circular buffer to save memory. + // The 2d arrays A and B in Section 7.17.3, the intermediate results in the + // box filter process. Reused for pass 0 and pass 1. Pass 0 uses 2 rows. Pass + // 1 uses 3 or 4 rows. + SgrIntermediateBuffer intermediate[6 * kIntermediateStride]; +}; + constexpr int kOneByX[25] = { 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; -// Compute integral image. In an integral image, each pixel value of (xi, yi) -// is the sum of all pixel values {(x, y) | x <= xi, y <= yi} from the source -// image. -// The integral image (II) can be calculated as: -// II(D) = Pixel(D) + II(B) + II(C) - II(A), -// where the rectangular region ABCD is -// A = (x, y), B = (x + 1, y), C = (x, y + 1), D = (x + 1, y + 1). -// Integral image helps to compute the sum of a rectangular area fast. -// The box centered at (x, y), with radius r, is rectangular ABCD: -// A = (x - r, y - r), B = (x + r, y - r), -// C = (x - r, y + r), D = (x + r, y + r), -// The sum of the box, or the rectangular ABCD can be calculated with the -// integral image (II): -// sum = II(D) - II(B') - II(C') + II(A'). -// A' = (x - r - 1, y - r - 1), B' = (x + r, y - r - 1), -// C' = (x - r - 1, y + r), D = (x + r, y + r), -// Here we calculate the integral image, as well as the squared integral image. 
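// Quick relation check (not from the diff) between the two lookup tables
// above: each new kSgrMa2Lookup entry is 256 - kXByXPlus1[z], with the
// endpoints z == 0 and z >= 255 treated as the special cases (1 and 256) that
// the first and last kXByXPlus1 entries encode. kSgrProjSgrBits is 8, as in
// the constants removed from the top of this file.
#include <cassert>

void VerifySgrLookups() {
  for (int z = 0; z < 256; ++z) {
    int a2;
    if (z == 0) {
      a2 = 1;
    } else if (z == 255) {
      a2 = 256;
    } else {
      a2 = ((z << 8) + (z >> 1)) / (z + 1);
    }
    assert(a2 == kXByXPlus1[z]);
    assert(kSgrMa2Lookup[z] == 256 - a2);
  }
}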
-template <typename Pixel> -void ComputeIntegralImage(const Pixel* const src, ptrdiff_t src_stride, - int width, int height, uint16_t* integral_image, - uint32_t* square_integral_image, - ptrdiff_t image_stride) { - memset(integral_image, 0, image_stride * sizeof(integral_image[0])); - memset(square_integral_image, 0, - image_stride * sizeof(square_integral_image[0])); - - const Pixel* src_ptr = src; - uint16_t* integral_image_ptr = integral_image + image_stride + 1; - uint32_t* square_integral_image_ptr = - square_integral_image + image_stride + 1; - int y = 0; - do { - integral_image_ptr[-1] = 0; - square_integral_image_ptr[-1] = 0; - for (int x = 0; x < width; ++x) { - integral_image_ptr[x] = src_ptr[x] + integral_image_ptr[x - 1] + - integral_image_ptr[x - image_stride] - - integral_image_ptr[x - image_stride - 1]; - square_integral_image_ptr[x] = - src_ptr[x] * src_ptr[x] + square_integral_image_ptr[x - 1] + - square_integral_image_ptr[x - image_stride] - - square_integral_image_ptr[x - image_stride - 1]; - } - src_ptr += src_stride; - integral_image_ptr += image_stride; - square_integral_image_ptr += image_stride; - } while (++y < height); -} - template <int bitdepth, typename Pixel> struct LoopRestorationFuncs_C { LoopRestorationFuncs_C() = delete; - // |stride| for SelfGuidedFilter and WienerFilter is given in bytes. static void SelfGuidedFilter(const void* source, void* dest, const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride, ptrdiff_t dest_stride, @@ -132,15 +109,18 @@ struct LoopRestorationFuncs_C { const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride, ptrdiff_t dest_stride, int width, int height, RestorationBuffer* buffer); - // |stride| for box filter processing is in Pixels. - static void BoxFilterPreProcess(const RestorationUnitInfo& restoration_info, - const uint16_t* integral_image, - const uint32_t* square_integral_image, - int width, int height, int pass, - RestorationBuffer* buffer); static void BoxFilterProcess(const RestorationUnitInfo& restoration_info, - const Pixel* src, ptrdiff_t stride, int width, - int height, RestorationBuffer* buffer); + const Pixel* src, ptrdiff_t src_stride, + int width, int height, SgrBuffer* buffer, + Pixel* dst, ptrdiff_t dst_stride); + static void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const Pixel* src, ptrdiff_t src_stride, + int width, int height, SgrBuffer* buffer, + Pixel* dst, ptrdiff_t dst_stride); + static void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const Pixel* src, ptrdiff_t src_stride, + int width, int height, SgrBuffer* buffer, + Pixel* dst, ptrdiff_t dst_stride); }; // Note: range of wiener filter coefficients. @@ -154,7 +134,7 @@ struct LoopRestorationFuncs_C { // filter[3] = 0 - (filter[0] + filter[1] + filter[2]) * 2. // Thus in libaom's computation, an offset of 128 is needed for filter[3]. 
inline void PopulateWienerCoefficients( - const RestorationUnitInfo& restoration_info, int direction, + const RestorationUnitInfo& restoration_info, const int direction, int16_t* const filter) { filter[3] = 128; for (int i = 0; i < 3; ++i) { @@ -178,26 +158,64 @@ inline int CountZeroCoefficients(const int16_t* const filter) { return number_zero_coefficients; } -template <typename Pixel> -inline int WienerHorizontal(const Pixel* const source, - const int16_t* const filter, - const int number_zero_coefficients, int sum) { +template <int bitdepth, typename Pixel> +inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride, + const int width, const int height, + const int16_t* const filter, + const int number_zero_coefficients, + uint16_t** wiener_buffer) { constexpr int kCenterTap = (kSubPixelTaps - 1) / 2; - for (int k = number_zero_coefficients; k < kCenterTap; ++k) { - sum += filter[k] * (source[k] + source[kSubPixelTaps - 2 - k]); - } - return sum; + constexpr int kRoundBitsHorizontal = (bitdepth == 12) + ? kInterRoundBitsHorizontal12bpp + : kInterRoundBitsHorizontal; + constexpr int limit = + (1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1; + constexpr int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1); + int y = height; + do { + int x = 0; + do { + // sum fits into 16 bits only when bitdepth = 8. + int sum = horizontal_rounding; + for (int k = number_zero_coefficients; k < kCenterTap; ++k) { + sum += filter[k] * (source[x + k] + source[x + kSubPixelTaps - 2 - k]); + } + sum += filter[kCenterTap] * source[x + kCenterTap]; + const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal); + (*wiener_buffer)[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); + } while (++x < width); + source += source_stride; + *wiener_buffer += width; + } while (--y != 0); } -inline int WienerVertical(const uint16_t* const source, - const int16_t* const filter, const int width, - const int number_zero_coefficients, int sum) { +template <int bitdepth, typename Pixel> +inline void WienerVertical(const uint16_t* wiener_buffer, const int width, + const int height, const int16_t* const filter, + const int number_zero_coefficients, void* const dest, + const ptrdiff_t dest_stride) { constexpr int kCenterTap = (kSubPixelTaps - 1) / 2; - for (int k = number_zero_coefficients; k < kCenterTap; ++k) { - sum += filter[k] * - (source[k * width] + source[(kSubPixelTaps - 2 - k) * width]); - } - return sum; + constexpr int kRoundBitsVertical = + (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical; + constexpr int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1)); + auto* dst = static_cast<Pixel*>(dest); + int y = height; + do { + int x = 0; + do { + // sum needs 32 bits. + int sum = vertical_rounding; + for (int k = number_zero_coefficients; k < kCenterTap; ++k) { + sum += filter[k] * (wiener_buffer[k * width + x] + + wiener_buffer[(kSubPixelTaps - 2 - k) * width + x]); + } + sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x]; + const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); + dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); + } while (++x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } // Note: bit range for wiener filter. 
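// Worked example (not from the diff) of the bounds used in WienerHorizontal()
// above for bitdepth 8, assuming the usual libgav1 values
// kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3.
constexpr int kExampleBitdepth = 8;
constexpr int kExampleWienerFilterBits = 7;     // assumed value
constexpr int kExampleRoundBitsHorizontal = 3;  // assumed value
// Largest value the horizontal stage can produce: 13 bits for 8 bpp, so the
// intermediate |wiener_buffer| values comfortably fit in uint16_t.
constexpr int kExampleLimit =
    (1 << (kExampleBitdepth + 1 + kExampleWienerFilterBits -
           kExampleRoundBitsHorizontal)) - 1;
static_assert(kExampleLimit == 8191, "13-bit horizontal output for 8 bpp");
// Offset added before filtering so the clipped horizontal sum is
// non-negative.
constexpr int kExampleHorizontalRounding =
    1 << (kExampleBitdepth + kExampleWienerFilterBits - 1);
static_assert(kExampleHorizontalRounding == 16384, "1 << 14 for 8 bpp");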
@@ -223,13 +241,6 @@ void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter( ptrdiff_t dest_stride, int width, int height, RestorationBuffer* const buffer) { constexpr int kCenterTap = (kSubPixelTaps - 1) / 2; - constexpr int kRoundBitsHorizontal = (bitdepth == 12) - ? kInterRoundBitsHorizontal12bpp - : kInterRoundBitsHorizontal; - constexpr int kRoundBitsVertical = - (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical; - const int limit = - (1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1; int16_t filter_horizontal[kSubPixelTaps / 2]; int16_t filter_vertical[kSubPixelTaps / 2]; PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, @@ -240,448 +251,470 @@ void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter( CountZeroCoefficients(filter_horizontal); const int number_zero_coefficients_vertical = CountZeroCoefficients(filter_vertical); - - source_stride /= sizeof(Pixel); - dest_stride /= sizeof(Pixel); + const int number_rows_to_skip = + std::max(number_zero_coefficients_vertical, 1); // horizontal filtering. const auto* src = static_cast<const Pixel*>(source); - src -= (kCenterTap - number_zero_coefficients_vertical) * source_stride + - kCenterTap; - auto* wiener_buffer = - buffer->wiener_buffer + number_zero_coefficients_vertical * width; - const int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1); - int y = height + kSubPixelTaps - 2 - 2 * number_zero_coefficients_vertical; + src -= (kCenterTap - number_rows_to_skip) * source_stride + kCenterTap; + auto* wiener_buffer = buffer->wiener_buffer + number_rows_to_skip * width; + const int height_horizontal = + height + kSubPixelTaps - 2 - 2 * number_rows_to_skip; if (number_zero_coefficients_horizontal == 0) { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. - int sum = horizontal_rounding; - sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 0, sum); - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 0, + &wiener_buffer); } else if (number_zero_coefficients_horizontal == 1) { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. - int sum = horizontal_rounding; - sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 1, sum); - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 1, + &wiener_buffer); } else if (number_zero_coefficients_horizontal == 2) { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. 
- int sum = horizontal_rounding; - sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 2, sum); - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 2, + &wiener_buffer); } else { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. - int sum = horizontal_rounding; - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 3, + &wiener_buffer); } // vertical filtering. - const int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1)); - auto* dst = static_cast<Pixel*>(dest); - wiener_buffer = buffer->wiener_buffer; - y = height; - if (number_zero_coefficients_vertical == 0) { - do { - int x = 0; - do { - // sum needs 32 bits. - int sum = vertical_rounding; - sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 0, sum); - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. + memcpy(wiener_buffer, wiener_buffer - width, + sizeof(*wiener_buffer) * width); + memcpy(buffer->wiener_buffer, buffer->wiener_buffer + width, + sizeof(*wiener_buffer) * width); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 0, dest, dest_stride); } else if (number_zero_coefficients_vertical == 1) { - do { - int x = 0; - do { - // sum needs 32 bits. - int sum = vertical_rounding; - sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 1, sum); - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 1, dest, dest_stride); } else if (number_zero_coefficients_vertical == 2) { - do { - int x = 0; - do { - // sum needs 32 bits. 
- int sum = vertical_rounding; - sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 2, sum); - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 2, dest, dest_stride); } else { - do { - int x = 0; - do { - // sum needs 32 bits. - int sum = vertical_rounding; - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 3, dest, dest_stride); } } +//------------------------------------------------------------------------------ +// SGR + +template <int bitdepth> +inline void CalculateIntermediate(const uint32_t s, uint32_t a, + const uint32_t b, const uint32_t n, + SgrIntermediateBuffer* const intermediate) { + // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1). + // since max bitdepth = 12, max < 2^31. + // after shift, a < 2^16 * n < 2^22 regardless of bitdepth + a = RightShiftWithRounding(a, (bitdepth - 8) << 1); + // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19. + // d < 2^8 * n < 2^14 regardless of bitdepth + const uint32_t d = RightShiftWithRounding(b, bitdepth - 8); + // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. + // This bound on p is due to: + // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances + // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b. + // This is an artifact of rounding, and can only happen if all pixels + // are (almost) identical, so in this case we saturate to p=0. + const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d; + // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale < + // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12 + // (this holds even after accounting for the rounding in s) + const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits); + // a2: range [1, 256]. + uint32_t a2 = kXByXPlus1[std::min(z, 255u)]; + const uint32_t one_over_n = kOneByX[n - 1]; + // (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n, + // one_over_n = round(2^12 / n) + // => the product here is < 2^(20 + bitdepth) <= 2^32, + // and b is set to a value < 2^(8 + bitdepth). + // This holds even with the rounding in one_over_n and in the overall + // result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8. 
+ const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n; + intermediate->a = a2; + intermediate->b = RightShiftWithRounding(b2, kSgrProjReciprocalBits); +} + template <int bitdepth, typename Pixel> -void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess( - const RestorationUnitInfo& restoration_info, const uint16_t* integral_image, - const uint32_t* square_integral_image, int width, int height, int pass, - RestorationBuffer* const buffer) { - const int sgr_proj_index = restoration_info.sgr_proj_info.index; - const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2]; - assert(radius != 0); - const uint32_t n = (2 * radius + 1) * (2 * radius + 1); - // const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1]; - // n2_with_scale: max value < 2^16. min value is 4. - // const uint32_t n2_with_scale = n * n * scale; - // s: max value < 2^12. - // const uint32_t s = - // ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale; - const uint32_t s = kSgrScaleParameter[sgr_proj_index][pass]; - assert(s != 0); - const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride; - const ptrdiff_t integral_image_stride = - kRestorationProcessingUnitSizeWithBorders + 1; - // The size of the intermediate result buffer is the size of the filter area - // plus horizontal (3) and vertical (3) padding. The processing start point - // is the filter area start point -1 row and -1 column. Therefore we need to - // set offset and use the intermediate_result as the start point for - // processing. - const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * array_stride + kRestorationBorder; - uint32_t* intermediate_result[2] = { - buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset - - array_stride, - buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset - - array_stride}; - - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. - const int step = (pass == 0) ? 2 : 1; - const ptrdiff_t intermediate_stride = step * array_stride; - for (int y = -1; y <= height; y += step) { - for (int x = -1; x <= width; ++x) { - // The integral image helps to calculate the sum of the square - // centered at (x, y). - // The calculation of a, b is equal to the following lines: - // uint32_t a = 0; - // uint32_t b = 0; - // for (int dy = -radius; dy <= radius; ++dy) { - // for (int dx = -radius; dx <= radius; ++dx) { - // const Pixel source = src[(y + dy) * stride + (x + dx)]; - // a += source * source; - // b += source; - // } - // } - const int top_left = - (y + kRestorationBorder - radius) * integral_image_stride + x + - kRestorationBorder - radius; - const int top_right = top_left + 2 * radius + 1; - const int bottom_left = - top_left + (2 * radius + 1) * integral_image_stride; - const int bottom_right = bottom_left + 2 * radius + 1; - uint32_t a = square_integral_image[bottom_right] - - square_integral_image[bottom_left] - - square_integral_image[top_right] + - square_integral_image[top_left]; - uint32_t b; - - if (bitdepth <= 10 || radius < 2) { - // The following cast is mandatory to get truncated sum. 
- b = static_cast<uint16_t>( - integral_image[bottom_right] - integral_image[bottom_left] - - integral_image[top_right] + integral_image[top_left]); - } else { - assert(radius == 2); - const uint16_t b_top_15_pixels = - integral_image[top_right + 3 * integral_image_stride] - - integral_image[top_left + 3 * integral_image_stride] - - integral_image[top_right] + integral_image[top_left]; - const uint16_t b_bottom_10_pixels = - integral_image[bottom_right] - integral_image[bottom_left] - - integral_image[top_right + 3 * integral_image_stride] + - integral_image[top_left + 3 * integral_image_stride]; - b = b_top_15_pixels + b_bottom_10_pixels; - } +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessTop( + const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s, + SgrIntermediateBuffer* intermediate) { + uint32_t a = 0; + uint32_t b = 0; + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[dx]; + a += source * source; + b += source; + } + a += a; + b += b; + for (int dy = 1; dy < 4; ++dy) { + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[dy * stride + dx]; + a += source * source; + b += source; + } + } + CalculateIntermediate<bitdepth>(s, a, b, 25, intermediate); + int x = width - 1; + do { + { + const Pixel source0 = src[0]; + const Pixel source1 = src[5]; + a += 2 * (source1 * source1 - source0 * source0); + b += 2 * (source1 - source0); + } + int dy = 1; + do { + const Pixel source0 = src[dy * stride]; + const Pixel source1 = src[dy * stride + 5]; + a -= source0 * source0; + a += source1 * source1; + b -= source0; + b += source1; + } while (++dy < 4); + src++; + CalculateIntermediate<bitdepth>(s, a, b, 25, ++intermediate); + } while (--x != 0); +} - // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1). - // since max bitdepth = 12, max < 2^31. - // after shift, a < 2^16 * n < 2^22 regardless of bitdepth - a = RightShiftWithRounding(a, (bitdepth - 8) << 1); - // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19. - // d < 2^8 * n < 2^14 regardless of bitdepth - const uint32_t d = RightShiftWithRounding(b, bitdepth - 8); - // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, - // and p itself satisfies p < 2^14 * n^2 < 2^26. - // This bound on p is due to: - // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances - // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b. - // This is an artifact of rounding, and can only happen if all pixels - // are (almost) identical, so in this case we saturate to p=0. - const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d; - // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale < - // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12 - // (this holds even after accounting for the rounding in s) - const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits); - // a2: range [1, 256]. - uint32_t a2 = kXByXPlus1[std::min(z, 255u)]; - const uint32_t one_over_n = kOneByX[n - 1]; - // (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n, - // one_over_n = round(2^12 / n) - // => the product here is < 2^(20 + bitdepth) <= 2^32, - // and b is set to a value < 2^(8 + bitdepth). - // This holds even with the rounding in one_over_n and in the overall - // result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8. 
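[Editor's aside, not part of the patch.] The range comment above (kept in the new CalculateIntermediate() and deleted here) leans on Popoviciu's inequality; a compact restatement of the step it relies on, using the comment's own symbols. With the window samples normalized to 8 bits by the two shifts, write them as x_1, ..., x_n with 0 <= x_i <= 2^8 - 1, so a ~ sum of x_i^2 and d ~ sum of x_i. Then

\[
  p = a\,n - d^2 \approx n\sum_i x_i^2 - \Bigl(\sum_i x_i\Bigr)^{\!2} = n^2\,\operatorname{Var}(x),
  \qquad
  \operatorname{Var}(x) \le \frac{(2^8-1)^2}{4} < 2^{14}
  \;\Rightarrow\; p < 2^{14} n^2 .
\]

The identity is exact only before rounding; the two RightShiftWithRounding() calls can push a*n slightly below d*d, which is why the code clamps p to zero in that case.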
- const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n; - intermediate_result[0][x] = a2; - intermediate_result[1][x] = - RightShiftWithRounding(b2, kSgrProjReciprocalBits); +template <int bitdepth, typename Pixel, int size> +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s, + SgrIntermediateBuffer* intermediate) { + const int n = size * size; + uint32_t a = 0; + uint32_t b = 0; + for (int dy = 0; dy < size; ++dy) { + for (int dx = 0; dx < size; ++dx) { + const Pixel source = src[dy * stride + dx]; + a += source * source; + b += source; } - intermediate_result[0] += intermediate_stride; - intermediate_result[1] += intermediate_stride; } + CalculateIntermediate<bitdepth>(s, a, b, n, intermediate); + int x = width - 1; + do { + int dy = 0; + do { + const Pixel source0 = src[dy * stride]; + const Pixel source1 = src[dy * stride + size]; + a -= source0 * source0; + a += source1 * source1; + b -= source0; + b += source1; + } while (++dy < size); + src++; + CalculateIntermediate<bitdepth>(s, a, b, n, ++intermediate); + } while (--x != 0); +} + +template <int bitdepth, typename Pixel> +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessBottom( + const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s, + SgrIntermediateBuffer* intermediate) { + uint32_t a = 0; + uint32_t b = 0; + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[3 * stride + dx]; + a += source * source; + b += source; + } + a += a; + b += b; + for (int dy = 0; dy < 3; ++dy) { + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[dy * stride + dx]; + a += source * source; + b += source; + } + } + CalculateIntermediate<bitdepth>(s, a, b, 25, intermediate); + int x = width - 1; + do { + { + const Pixel source0 = src[3 * stride + 0]; + const Pixel source1 = src[3 * stride + 5]; + a += 2 * (source1 * source1 - source0 * source0); + b += 2 * (source1 - source0); + } + int dy = 0; + do { + const Pixel source0 = src[dy * stride]; + const Pixel source1 = src[dy * stride + 5]; + a -= source0 * source0; + a += source1 * source1; + b -= source0; + b += source1; + } while (++dy < 3); + src++; + CalculateIntermediate<bitdepth>(s, a, b, 25, ++intermediate); + } while (--x != 0); +} + +inline void Sum565(const SgrIntermediateBuffer* const intermediate, + uint16_t* const a, uint32_t* const b) { + *a = 5 * (intermediate[0].a + intermediate[2].a) + 6 * intermediate[1].a; + *b = 5 * (intermediate[0].b + intermediate[2].b) + 6 * intermediate[1].b; +} + +template <typename Pixel> +inline int CalculateFilteredOutput(const Pixel src, const uint32_t a, + const uint32_t b, const int shift) { + // v < 2^32. All intermediate calculations are positive. 
+ const uint32_t v = a * src + b; + return RightShiftWithRounding(v, + kSgrProjSgrBits + shift - kSgrProjRestoreBits); +} + +template <typename Pixel> +inline void BoxFilterPass1(const Pixel src0, const Pixel src1, + const SgrIntermediateBuffer* const intermediate[2], + const ptrdiff_t x, int p[2]) { + uint16_t a[2]; + uint32_t b[2]; + Sum565(intermediate[0] + x, &a[0], &b[0]); + Sum565(intermediate[1] + x, &a[1], &b[1]); + p[0] = CalculateFilteredOutput<Pixel>(src0, a[0] + a[1], b[0] + b[1], 5); + p[1] = CalculateFilteredOutput<Pixel>(src1, a[1], b[1], 4); +} + +template <typename Pixel> +inline int BoxFilterPass2(const Pixel src, + const SgrIntermediateBuffer* const intermediate[3], + const ptrdiff_t x) { + const uint32_t a = 3 * (intermediate[0][x + 0].a + intermediate[0][x + 2].a + + intermediate[2][x + 0].a + intermediate[2][x + 2].a) + + 4 * (intermediate[0][x + 1].a + intermediate[1][x + 0].a + + intermediate[1][x + 1].a + intermediate[1][x + 2].a + + intermediate[2][x + 1].a); + const uint32_t b = 3 * (intermediate[0][x + 0].b + intermediate[0][x + 2].b + + intermediate[2][x + 0].b + intermediate[2][x + 2].b) + + 4 * (intermediate[0][x + 1].b + intermediate[1][x + 0].b + + intermediate[1][x + 1].b + intermediate[1][x + 2].b + + intermediate[2][x + 1].b); + return CalculateFilteredOutput<Pixel>(src, a, b, 5); +} + +template <int bitdepth, typename Pixel> +inline Pixel SelfGuidedDoubleMultiplier(const int src, + const int box_filter_process_output0, + const int box_filter_process_output1, + const int16_t w0, const int16_t w1, + const int16_t w2) { + const int v = w1 * (src << kSgrProjRestoreBits) + + w0 * box_filter_process_output0 + + w2 * box_filter_process_output1; + // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: + // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. + // Then, range of s is bitdepth + 2. This is a rough estimation, taking + // the maximum value of each element. + const int s = + RightShiftWithRounding(v, kSgrProjRestoreBits + kSgrProjPrecisionBits); + return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1)); +} + +template <int bitdepth, typename Pixel> +inline Pixel SelfGuidedSingleMultiplier(const int src, + const int box_filter_process_output, + const int16_t w0, const int16_t w1) { + const int v = + w1 * (src << kSgrProjRestoreBits) + w0 * box_filter_process_output; + // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: + // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. + // Then, range of s is bitdepth + 2. This is a rough estimation, taking + // the maximum value of each element. + const int s = + RightShiftWithRounding(v, kSgrProjRestoreBits + kSgrProjPrecisionBits); + return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1)); } template <int bitdepth, typename Pixel> -void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( +inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( const RestorationUnitInfo& restoration_info, const Pixel* src, - ptrdiff_t stride, int width, int height, RestorationBuffer* const buffer) { + const ptrdiff_t src_stride, const int width, const int height, + SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) { const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t s0 = kSgrScaleParameter[sgr_proj_index][0]; // s0 < 2^12. + const uint32_t s1 = kSgrScaleParameter[sgr_proj_index][1]; // s1 < 2^12. 
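[Editor's aside, not part of the patch.] The BoxFilterPreProcessTop/BoxFilterPreProcess/BoxFilterPreProcessBottom helpers above replace the old integral images with box sums that are seeded once and then slid one column at a time: the column leaving the window is subtracted and the column entering it is added before each CalculateIntermediate() call. A minimal scalar sketch of that update pattern (ad hoc names and signature, not the libgav1 API):

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Produce, for every horizontal position, the sum and the sum of squares of
  // a size x size window, seeding the first window and then sliding it, as the
  // BoxFilterPreProcess() variants do above.
  void SlidingWindowSums(const uint16_t* src, ptrdiff_t stride, int size,
                         int width, std::vector<uint32_t>* sum,
                         std::vector<uint32_t>* square_sum) {
    uint32_t b = 0;  // sum of the current window
    uint32_t a = 0;  // sum of squares of the current window
    for (int dy = 0; dy < size; ++dy) {
      for (int dx = 0; dx < size; ++dx) {
        const uint32_t s = src[dy * stride + dx];
        a += s * s;
        b += s;
      }
    }
    sum->push_back(b);
    square_sum->push_back(a);
    for (int x = 1; x < width; ++x) {
      for (int dy = 0; dy < size; ++dy) {
        const uint32_t leaving = src[dy * stride + x - 1];
        const uint32_t entering = src[dy * stride + x - 1 + size];
        a += entering * entering - leaving * leaving;
        b += entering - leaving;
      }
      sum->push_back(b);
      square_sum->push_back(a);
    }
  }

Each step touches 2 * size samples instead of re-summing the full size * size window.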
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + SgrIntermediateBuffer *intermediate0[2], *intermediate1[4]; + assert(s0 != 0); + assert(s1 != 0); + intermediate0[0] = buffer->intermediate; + intermediate0[1] = intermediate0[0] + kIntermediateStride; + intermediate1[0] = intermediate0[1] + kIntermediateStride; + intermediate1[1] = intermediate1[0] + kIntermediateStride, + intermediate1[2] = intermediate1[1] + kIntermediateStride, + intermediate1[3] = intermediate1[2] + kIntermediateStride; + BoxFilterPreProcessTop<bitdepth, Pixel>(src - 2 * src_stride - 3, src_stride, + width + 2, s0, intermediate0[0]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2 * src_stride - 2, src_stride, + width + 2, s1, intermediate1[0]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 1 * src_stride - 2, src_stride, + width + 2, s1, intermediate1[1]); + for (int y = height >> 1; y != 0; --y) { + BoxFilterPreProcess<bitdepth, Pixel, 5>(src - src_stride - 3, src_stride, + width + 2, s0, intermediate0[1]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s1, + intermediate1[2]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src + src_stride - 2, src_stride, + width + 2, s1, intermediate1[3]); + int x = 0; + do { + int p[2][2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate0, x, + p[0]); + p[1][0] = BoxFilterPass2<Pixel>(src[x], intermediate1, x); + p[1][1] = + BoxFilterPass2<Pixel>(src[src_stride + x], intermediate1 + 1, x); + dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0], + p[1][0], w0, w1, w2); + dst[dst_stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>( + src[src_stride + x], p[0][1], p[1][1], w0, w1, w2); + } while (++x < width); + src += 2 * src_stride; + dst += 2 * dst_stride; + std::swap(intermediate0[0], intermediate0[1]); + std::swap(intermediate1[0], intermediate1[2]); + std::swap(intermediate1[1], intermediate1[3]); + } + if ((height & 1) != 0) { + BoxFilterPreProcessBottom<bitdepth, Pixel>(src - src_stride - 3, src_stride, + width + 2, s0, intermediate0[1]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s1, + intermediate1[2]); + int x = 0; + do { + int p[2][2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate0, x, + p[0]); + p[1][0] = BoxFilterPass2<Pixel>(src[x], intermediate1, x); + dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0], + p[1][0], w0, w1, w2); + } while (++x < width); + } +} - // We calculate intermediate values for the region (width + 1) x (height + 1). - // The region we can access is (width + 1 + radius) x (height + 1 + radius). - // The max radius is 2. width = height = - // kRestorationProcessingUnitSizeWithBorders. - // For the integral_image, we need one row before the accessible region, - // so the stride is kRestorationProcessingUnitSizeWithBorders + 1. - // We fix the first row and first column of integral image be 0 to facilitate - // computation. - - // Note that the max sum = (2 ^ bitdepth - 1) * - // kRestorationProcessingUnitSizeWithBorders * - // kRestorationProcessingUnitSizeWithBorders. - // The max sum is larger than 2^16. - // Case 8 bit and 10 bit: - // The final box sum has at most 25 pixels, which is within 16 bits. So - // keeping truncated 16-bit values is enough. - // Case 12 bit, radius 1: - // The final box sum has 9 pixels, which is within 16 bits. 
So keeping - // truncated 16-bit values is enough. - // Case 12 bit, radius 2: - // The final box sum has 25 pixels. It can be calculated by calculating the - // top 15 pixels and the bottom 10 pixels separately, and adding them - // together. So keeping truncated 16-bit values is enough. - // If it is slower than using 32-bit for specific CPU targets, please split - // into 2 paths. - uint16_t integral_image[(kRestorationProcessingUnitSizeWithBorders + 1) * - (kRestorationProcessingUnitSizeWithBorders + 1)]; - - // Note that the max squared sum = - // (2 ^ bitdepth - 1) * (2 ^ bitdepth - 1) * - // kRestorationProcessingUnitSizeWithBorders * - // kRestorationProcessingUnitSizeWithBorders. - // For 8 bit, 32-bit is enough. For 10 bit and up, the sum could be larger - // than 2^32. However, the final box sum has at most 25 squares, which is - // within 32 bits. So keeping truncated 32-bit values is enough. - uint32_t - square_integral_image[(kRestorationProcessingUnitSizeWithBorders + 1) * - (kRestorationProcessingUnitSizeWithBorders + 1)]; - const ptrdiff_t integral_image_stride = - kRestorationProcessingUnitSizeWithBorders + 1; - const ptrdiff_t filtered_output_stride = - buffer->box_filter_process_output_stride; - const ptrdiff_t intermediate_stride = - buffer->box_filter_process_intermediate_stride; - const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * intermediate_stride + kRestorationBorder; - - ComputeIntegralImage<Pixel>( - src - kRestorationBorder * stride - kRestorationBorder, stride, - width + 2 * kRestorationBorder, height + 2 * kRestorationBorder, - integral_image, square_integral_image, integral_image_stride); - - for (int pass = 0; pass < 2; ++pass) { - const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2]; - if (radius == 0) continue; - LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess( - restoration_info, integral_image, square_integral_image, width, height, - pass, buffer); - - const Pixel* src_ptr = src; - // Set intermediate buffer start point to the actual start point of - // filtering. - const uint32_t* array_start[2] = { - buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset, - buffer->box_filter_process_intermediate[1] + - intermediate_buffer_offset}; - int* filtered_output = buffer->box_filter_process_output[pass]; - for (int y = 0; y < height; ++y) { - const int shift = (pass == 0 && (y & 1) != 0) ? 4 : 5; - // array_start[0]: range [1, 256]. - // array_start[1] < 2^20. 
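[Editor's aside, not part of the patch.] The deleted notes just above (array_start[0] in [1, 256], array_start[1] < 2^20) are easier to read together with what the a2/b2 pair means. Since the comment in CalculateIntermediate() states one_over_n = round(2^12 / n) and kSgrProjReciprocalBits is 12, the stored b is roughly (2^8 - a2) times the window mean, so with the spatial weights summing to a power of two the filtered output is a blend of the pixel and its local mean (x-bar being the window mean after the bit-depth shift):

\[
  \frac{b_2}{2^{12}} \;\approx\; (2^8 - a_2)\,\bar{x},
  \qquad
  \text{CalculateFilteredOutput} \;\approx\;
  \Bigl(\tfrac{a_2}{2^8}\,\text{src} + \bigl(1 - \tfrac{a_2}{2^8}\bigr)\,\bar{x}\Bigr)
  \cdot 2^{\text{kSgrProjRestoreBits}} .
\]

If, as its name suggests, kXByXPlus1[z] is roughly 2^8 * z / (z + 1) (an assumption here, the table itself is not in this patch), then a2 / 2^8 approaches 1 where the window variance is large, so detailed areas keep the source pixel while flat areas are pulled toward their mean.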
- for (int x = 0; x < width; ++x) { - uint32_t a, b; - if (pass == 0) { - if ((y & 1) == 0) { - a = 5 * (array_start[0][-intermediate_stride + x - 1] + - array_start[0][-intermediate_stride + x + 1] + - array_start[0][intermediate_stride + x - 1] + - array_start[0][intermediate_stride + x + 1]) + - 6 * (array_start[0][-intermediate_stride + x] + - array_start[0][intermediate_stride + x]); - b = 5 * (array_start[1][-intermediate_stride + x - 1] + - array_start[1][-intermediate_stride + x + 1] + - array_start[1][intermediate_stride + x - 1] + - array_start[1][intermediate_stride + x + 1]) + - 6 * (array_start[1][-intermediate_stride + x] + - array_start[1][intermediate_stride + x]); - } else { - a = 5 * (array_start[0][x - 1] + array_start[0][x + 1]) + - 6 * array_start[0][x]; - b = 5 * (array_start[1][x - 1] + array_start[1][x + 1]) + - 6 * array_start[1][x]; - } - } else { - a = 3 * (array_start[0][-intermediate_stride + x - 1] + - array_start[0][-intermediate_stride + x + 1] + - array_start[0][intermediate_stride + x - 1] + - array_start[0][intermediate_stride + x + 1]) + - 4 * (array_start[0][-intermediate_stride + x] + - array_start[0][x - 1] + array_start[0][x] + - array_start[0][x + 1] + - array_start[0][intermediate_stride + x]); - b = 3 * (array_start[1][-intermediate_stride + x - 1] + - array_start[1][-intermediate_stride + x + 1] + - array_start[1][intermediate_stride + x - 1] + - array_start[1][intermediate_stride + x + 1]) + - 4 * (array_start[1][-intermediate_stride + x] + - array_start[1][x - 1] + array_start[1][x] + - array_start[1][x + 1] + - array_start[1][intermediate_stride + x]); - } - // v < 2^32. All intermediate calculations are positive. - const uint32_t v = a * src_ptr[x] + b; - filtered_output[x] = RightShiftWithRounding( - v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - } - src_ptr += stride; - array_start[0] += intermediate_stride; - array_start[1] += intermediate_stride; - filtered_output += filtered_output_stride; - } +template <int bitdepth, typename Pixel> +inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass1( + const RestorationUnitInfo& restoration_info, const Pixel* src, + const ptrdiff_t src_stride, const int width, const int height, + SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) { + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t s = kSgrScaleParameter[sgr_proj_index][0]; // s < 2^12. 
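[Editor's aside, not part of the patch.] The shift argument passed to CalculateFilteredOutput() encodes the total spatial weight applied to the intermediates, the same 4-versus-5 split the deleted loop above selected with (pass == 0 && (y & 1) != 0) ? 4 : 5. A compile-time restatement of that bookkeeping:

  // Sum565() weights one row of intermediates 5-6-5; BoxFilterPass1() adds two
  // such rows for one output row of each pair (total 32) and uses a single row
  // for the other (total 16). BoxFilterPass2() weights the four corners by 3
  // and the five cross positions by 4 (total 32). Every total is a power of
  // two, so the normalization folds into the final right shift.
  constexpr int kSum565OneRow = 5 + 6 + 5;
  constexpr int kSum565TwoRows = 2 * kSum565OneRow;
  constexpr int kSum343Cross = 4 * 3 + 5 * 4;
  static_assert(kSum565OneRow == 1 << 4, "pass 1, single-row output: shift 4");
  static_assert(kSum565TwoRows == 1 << 5, "pass 1, two-row output: shift 5");
  static_assert(kSum343Cross == 1 << 5, "pass 2: shift 5");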
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0; + SgrIntermediateBuffer* intermediate[2]; + assert(s != 0); + intermediate[0] = buffer->intermediate; + intermediate[1] = intermediate[0] + kIntermediateStride; + BoxFilterPreProcessTop<bitdepth, Pixel>(src - 2 * src_stride - 3, src_stride, + width + 2, s, intermediate[0]); + for (int y = height >> 1; y != 0; --y) { + BoxFilterPreProcess<bitdepth, Pixel, 5>(src - src_stride - 3, src_stride, + width + 2, s, intermediate[1]); + int x = 0; + do { + int p[2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate, x, p); + dst[x] = + SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0, w1); + dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>( + src[src_stride + x], p[1], w0, w1); + } while (++x < width); + src += 2 * src_stride; + dst += 2 * dst_stride; + std::swap(intermediate[0], intermediate[1]); + } + if ((height & 1) != 0) { + BoxFilterPreProcessBottom<bitdepth, Pixel>(src - src_stride - 3, src_stride, + width + 2, s, intermediate[1]); + int x = 0; + do { + int p[2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate, x, p); + dst[x] = + SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0, w1); + dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>( + src[src_stride + x], p[1], w0, w1); + } while (++x < width); } } -// Assume box_filter_process_output[2] are allocated before calling -// this function. Their sizes are width * height, stride equals width. +template <int bitdepth, typename Pixel> +inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass2( + const RestorationUnitInfo& restoration_info, const Pixel* src, + const ptrdiff_t src_stride, const int width, const int height, + SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t s = kSgrScaleParameter[sgr_proj_index][1]; // s < 2^12. 
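[Editor's aside, not part of the patch.] BoxFilterProcessPass1() above feeds its output into SelfGuidedSingleMultiplier(), whose weights are constructed to sum to 1 << kSgrProjPrecisionBits; one consequence worth spelling out is that a pass returning exactly src << kSgrProjRestoreBits leaves the pixel unchanged. A small standalone check of that property (kSgrProjRestoreBits = 4 comes from the loop_restoration.h hunk below; kSgrProjPrecisionBits = 7 is assumed here, and the bit-depth clipping is omitted):

  #include <cassert>
  #include <cstdint>

  constexpr int kRestoreBits = 4;    // kSgrProjRestoreBits in this patch
  constexpr int kPrecisionBits = 7;  // assumed value of kSgrProjPrecisionBits

  int SelfGuidedSingle(int src, int box_filter_output, int16_t w0) {
    const int16_t w1 = (1 << kPrecisionBits) - w0;
    const int v = w1 * (src << kRestoreBits) + w0 * box_filter_output;
    const int shift = kRestoreBits + kPrecisionBits;
    return (v + (1 << (shift - 1))) >> shift;  // RightShiftWithRounding, v >= 0
  }

  int main() {
    for (int src = 0; src < 256; ++src) {
      // When the box filter reproduces the (scaled) source, the projection is
      // the identity regardless of w0, because w0 + w1 == 1 << kPrecisionBits.
      assert(SelfGuidedSingle(src, src << kRestoreBits, /*w0=*/47) == src);
    }
    return 0;
  }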
+ SgrIntermediateBuffer* intermediate[3]; + assert(s != 0); + intermediate[0] = buffer->intermediate; + intermediate[1] = intermediate[0] + kIntermediateStride; + intermediate[2] = intermediate[1] + kIntermediateStride; + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2 * src_stride - 2, src_stride, + width + 2, s, intermediate[0]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 1 * src_stride - 2, src_stride, + width + 2, s, intermediate[1]); + int y = height; + do { + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s, + intermediate[2]); + int x = 0; + do { + const int p = BoxFilterPass2<Pixel>(src[x], intermediate, x); + dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0, w1); + } while (++x < width); + src += src_stride; + dst += dst_stride; + SgrIntermediateBuffer* const intermediate0 = intermediate[0]; + intermediate[0] = intermediate[1]; + intermediate[1] = intermediate[2]; + intermediate[2] = intermediate0; + } while (--y != 0); +} + template <int bitdepth, typename Pixel> void LoopRestorationFuncs_C<bitdepth, Pixel>::SelfGuidedFilter( const void* const source, void* const dest, const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride, ptrdiff_t dest_stride, int width, int height, - RestorationBuffer* const buffer) { - const int w0 = restoration_info.sgr_proj_info.multiplier[0]; - const int w1 = restoration_info.sgr_proj_info.multiplier[1]; - const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + RestorationBuffer* const /*buffer*/) { const int index = restoration_info.sgr_proj_info.index; - const int radius_pass_0 = kSgrProjParams[index][0]; - const int radius_pass_1 = kSgrProjParams[index][2]; - const ptrdiff_t array_stride = buffer->box_filter_process_output_stride; - const int* box_filter_process_output[2] = { - buffer->box_filter_process_output[0], - buffer->box_filter_process_output[1]}; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 const auto* src = static_cast<const Pixel*>(source); auto* dst = static_cast<Pixel*>(dest); - source_stride /= sizeof(Pixel); - dest_stride /= sizeof(Pixel); - LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( - restoration_info, src, source_stride, width, height, buffer); - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - const int u = src[x] << kSgrProjRestoreBits; - int v = w1 * u; - if (radius_pass_0 != 0) { - v += w0 * box_filter_process_output[0][x]; - } else { - v += w0 * u; - } - if (radius_pass_1 != 0) { - v += w2 * box_filter_process_output[1][x]; - } else { - v += w2 * u; - } - // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: - // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. - // Then, range of s is bitdepth + 2. This is a rough estimation, taking - // the maximum value of each element. - const int s = RightShiftWithRounding( - v, kSgrProjRestoreBits + kSgrProjPrecisionBits); - dst[x] = static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1)); - } - src += source_stride; - dst += dest_stride; - box_filter_process_output[0] += array_stride; - box_filter_process_output[1] += array_stride; + SgrBuffer buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. 
+ assert(radius_pass_0 != 0); + LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass1( + restoration_info, src, source_stride, width, height, &buffer, dst, + dest_stride); + } else if (radius_pass_0 == 0) { + LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass2( + restoration_info, src, source_stride, width, height, &buffer, dst, + dest_stride); + } else { + LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( + restoration_info, src, source_stride, width, height, &buffer, dst, + dest_stride); } } @@ -739,7 +772,7 @@ void LoopRestorationInit_C() { // available. static_cast<void>(CountZeroCoefficients); static_cast<void>(PopulateWienerCoefficients); - static_cast<void>(WienerVertical); + static_cast<void>(Sum565); } } // namespace dsp diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h index 663639c682f..d5511eab24f 100644 --- a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h +++ b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h @@ -38,6 +38,19 @@ namespace libgav1 { namespace dsp { +enum { + // Precision of a division table (mtable) + kSgrProjScaleBits = 20, + kSgrProjReciprocalBits = 12, + // Core self-guided restoration precision bits. + kSgrProjSgrBits = 8, + // Precision bits of generated values higher than source before projection. + kSgrProjRestoreBits = 4 +}; // anonymous enum + +extern const int kXByXPlus1[256]; +extern const uint8_t kSgrMa2Lookup[256]; + // Initializes Dsp::loop_restorations. This function is not thread-safe. void LoopRestorationInit_C(); diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc index 59cfeb4db72..b51ec8f7270 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc +++ b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc @@ -22,6 +22,7 @@ #include "src/dsp/dsp.h" #include "src/utils/common.h" #include "src/utils/constants.h" +#include "src/utils/reference_info.h" #include "src/utils/types.h" namespace libgav1 { @@ -36,12 +37,11 @@ namespace { !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel)) // 7.9.2. 
-void MotionFieldProjectionKernel_C( - const ReferenceFrameType* source_reference_type, const MotionVector* mv, - const uint8_t order_hint[kNumReferenceFrameTypes], - unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* motion_field) { +void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info, + int reference_to_current_with_sign, + int dst_sign, int y8_start, int y8_end, + int x8_start, int x8_end, + TemporalMotionField* motion_field) { const ptrdiff_t stride = motion_field->mv.columns(); // The column range has to be offset by kProjectionMvMaxHorizontalOffset since // coordinates in that range could end up being position_x8 because of @@ -50,37 +50,31 @@ void MotionFieldProjectionKernel_C( std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0); const int adjusted_x8_end = std::min( x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride)); + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; MotionVector* dst_mv = motion_field->mv[y8_start]; - int reference_offsets[kNumReferenceFrameTypes]; - bool skip_reference[kNumReferenceFrameTypes]; assert(stride == motion_field->reference_offset.columns()); assert((y8_start & 7) == 0); - // Initialize skip_reference[kReferenceFrameIntra] to simplify branch - // conditions in projection. - skip_reference[kReferenceFrameIntra] = true; - for (int reference_type = kReferenceFrameLast; - reference_type <= kNumInterReferenceFrameTypes; ++reference_type) { - const int reference_offset = - GetRelativeDistance(current_frame_order_hint, - order_hint[reference_type], order_hint_shift_bits); - skip_reference[reference_type] = - reference_offset > kMaxFrameDistance || reference_offset <= 0; - reference_offsets[reference_type] = reference_offset; - } - int y8 = y8_start; do { const int y8_floor = (y8 & ~7) - y8; const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); int x8 = adjusted_x8_start; do { - if (skip_reference[source_reference_type[x8]]) continue; - const int reference_offset = reference_offsets[source_reference_type[x8]]; + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; MotionVector projection_mv; // reference_to_current_with_sign could be 0. - GetMvProjection(mv[x8], reference_to_current_with_sign, reference_offset, + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], &projection_mv); // Do not update the motion vector if the block position is not valid or // if position_x8 is outside the current range of x8_start and x8_end. 
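[Editor's aside, not part of the patch.] With ReferenceInfo, the kernel above no longer derives reference offsets on the fly; in particular GetMvProjection() now receives projection_divisions[], a precomputed per-reference reciprocal of the frame distance, so the per-block projection becomes a multiply and shift rather than a division. A scalar sketch of that idea (the 1 << 14 scale mirrors the AV1 reference division table and is an assumption of this sketch, as is the rounding; any clamping the decoder applies is omitted):

  #include <cstdint>

  constexpr int kScaleBits = 14;  // assumed precision of the division table

  // division ~= round((1 << kScaleBits) / denominator), built once per
  // reference frame (the role projection_divisions plays above, under this
  // sketch's assumptions). Requires denominator >= 1.
  int16_t MakeDivision(int denominator) {
    return static_cast<int16_t>(((1 << kScaleBits) + (denominator >> 1)) /
                                denominator);
  }

  // mv * numerator / denominator ~= (mv * numerator * division) >> kScaleBits.
  int16_t ProjectComponent(int16_t mv, int numerator, int16_t division) {
    const int64_t value = static_cast<int64_t>(mv) * numerator * division;
    const int64_t offset = int64_t{1} << (kScaleBits - 1);
    const int64_t rounded = (value >= 0) ? (value + offset) >> kScaleBits
                                         : -((-value + offset) >> kScaleBits);
    return static_cast<int16_t>(rounded);
  }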
@@ -97,9 +91,9 @@ void MotionFieldProjectionKernel_C( if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; dst_mv[position_y8 * stride + position_x8] = mv[x8]; dst_reference_offset[position_y8 * stride + position_x8] = - reference_offset; + reference_offsets[source_reference_type]; } while (++x8 < adjusted_x8_end); - source_reference_type += stride; + source_reference_types += stride; mv += stride; dst_reference_offset += stride; dst_mv += stride; diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h index 5b18be5a3ac..36de459d8f3 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h +++ b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h @@ -24,6 +24,14 @@ // ARM: #include "src/dsp/arm/motion_field_projection_neon.h" +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +// SSE4_1 +#include "src/dsp/x86/motion_field_projection_sse4.h" +// clang-format on // IWYU pragma: end_exports diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc index 33ecb2b1818..94023027fd9 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc +++ b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc @@ -47,9 +47,10 @@ void MvProjectionCompoundLowPrecision_C( for (int i = 0; i < 2; ++i) { // |offsets| non-zero check usually equals true and could be ignored. if (offsets[i] != 0) { - GetMvProjection(temporal_mvs[index], offsets[i], - temporal_reference_offsets[index], - &candidate_mvs[index].mv[i]); + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); for (auto& mv : candidate_mvs[index].mv[i].mv) { // The next line is equivalent to: // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1; @@ -73,9 +74,10 @@ void MvProjectionCompoundForceInteger_C( for (int i = 0; i < 2; ++i) { // |offsets| non-zero check usually equals true and could be ignored. if (offsets[i] != 0) { - GetMvProjection(temporal_mvs[index], offsets[i], - temporal_reference_offsets[index], - &candidate_mvs[index].mv[i]); + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); for (auto& mv : candidate_mvs[index].mv[i].mv) { // The next line is equivalent to: // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7; @@ -101,9 +103,10 @@ void MvProjectionCompoundHighPrecision_C( for (int i = 0; i < 2; ++i) { // |offsets| non-zero check usually equals true and could be ignored. 
if (offsets[i] != 0) { - GetMvProjection(temporal_mvs[index], offsets[i], - temporal_reference_offsets[index], - &candidate_mvs[index].mv[i]); + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); } } } while (++index < count); @@ -115,8 +118,10 @@ void MvProjectionSingleLowPrecision_C( const int count, MotionVector* const candidate_mvs) { int index = 0; do { - GetMvProjection(temporal_mvs[index], reference_offset, - temporal_reference_offsets[index], &candidate_mvs[index]); + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); for (auto& mv : candidate_mvs[index].mv) { // The next line is equivalent to: // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1; @@ -131,8 +136,10 @@ void MvProjectionSingleForceInteger_C( const int count, MotionVector* const candidate_mvs) { int index = 0; do { - GetMvProjection(temporal_mvs[index], reference_offset, - temporal_reference_offsets[index], &candidate_mvs[index]); + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); for (auto& mv : candidate_mvs[index].mv) { // The next line is equivalent to: // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7; @@ -149,8 +156,10 @@ void MvProjectionSingleHighPrecision_C( const int count, MotionVector* const candidate_mvs) { int index = 0; do { - GetMvProjection(temporal_mvs[index], reference_offset, - temporal_reference_offsets[index], &candidate_mvs[index]); + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); } while (++index < count); } diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h index 7ab99a3f2f9..ae16726a961 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h +++ b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h @@ -25,6 +25,15 @@ // ARM: #include "src/dsp/arm/motion_vector_search_neon.h" +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +// SSE4_1 +#include "src/dsp/x86/motion_vector_search_sse4.h" +// clang-format on + // IWYU pragma: end_exports namespace libgav1 { diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc index eed99e5a9c6..fd2c54af4f2 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc @@ -38,16 +38,7 @@ namespace dsp { namespace low_bitdepth { namespace { -// CdefDirection: -// Mirror values and pad to 16 elements. -alignas(16) constexpr uint32_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, - 120, 105, 120, 140, 168, 210, - 280, 420, 840, 0}; - -// Used when calculating odd |cost[x]| values to mask off unwanted elements. -// Holds elements 1 3 5 X 5 3 1 X -alignas(16) constexpr uint32_t kDivisionTableOdd[] = {420, 210, 140, 0, - 140, 210, 420, 0}; +#include "src/dsp/cdef.inc" // Used to calculate |partial[0][i + j]| and |partial[4][7 + i - j]|. The input // is |src[j]| and it is being added to |partial[]| based on the above indices. 
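[Editor's aside, not part of the patch.] Back in the motion_vector_search.cc hunks above, the retained comments spell out the two post-projection rounding rules: low precision forces each component to an even value by stepping odd values toward zero, and force-integer snaps the magnitude to a multiple of 8 (full pel in eighth-pel units). The straightforward forms quoted in those comments, written out and spot-checked (the library's actual branch-free lines are not shown in this diff, so they are not reproduced):

  #include <cassert>
  #include <cstdlib>

  // "if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;"  -- low precision.
  int RoundToEven(int mv) {
    if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
    return mv;
  }

  // "(std::abs(mv) + 3) & ~7"  -- force integer; the sign restoration here is
  // illustrative only.
  int ForceInteger(int mv) {
    const int value = (std::abs(mv) + 3) & ~7;
    return (mv < 0) ? -value : value;
  }

  int main() {
    assert(RoundToEven(7) == 6 && RoundToEven(-7) == -6 && RoundToEven(4) == 4);
    assert(ForceInteger(13) == 16 && ForceInteger(-5) == -8 &&
           ForceInteger(4) == 0);
    return 0;
  }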
@@ -160,10 +151,10 @@ inline __m128i Square_S32(__m128i a) { return _mm_mullo_epi32(a, a); } // |cost[0]| and |cost[4]| square the input and sum with the corresponding // element from the other end of the vector: -// |kDivisionTable[]| element: +// |kCdefDivisionTable[]| element: // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * -// kDivisionTable[i + 1]; -// cost[0] += Square(partial[0][7]) * kDivisionTable[8]; +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8]; // Because everything is being summed into a single value the distributive // property allows us to mirror the division table and accumulate once. inline uint32_t Cost0Or4(const __m128i a, const __m128i b, @@ -185,16 +176,16 @@ inline uint32_t CostOdd(const __m128i a, const __m128i b, const __m128i a_hi_square = Square_S32(_mm_cvtepi16_epi32(_mm_srli_si128(a, 8))); // Swap element 0 and element 2. This pairs partial[i][10 - j] with - // kDivisionTable[2*j+1]. + // kCdefDivisionTable[2*j+1]. const __m128i b_lo_square = _mm_shuffle_epi32(Square_S32(_mm_cvtepi16_epi32(b)), 0x06); // First terms are indices 3-7. __m128i c = _mm_srli_si128(a_lo_square, 12); c = _mm_add_epi32(c, a_hi_square); - c = _mm_mullo_epi32(c, _mm_set1_epi32(kDivisionTable[7])); + c = _mm_mullo_epi32(c, _mm_set1_epi32(kCdefDivisionTable[7])); // cost[i] += (Square(base_partial[i][j]) + Square(base_partial[i][10 - j])) * - // kDivisionTable[2 * j + 1]; + // kCdefDivisionTable[2 * j + 1]; const __m128i second_cost = _mm_add_epi32(a_lo_square, b_lo_square); c = _mm_add_epi32(c, _mm_mullo_epi32(second_cost, division_table)); return SumVector_S32(c); @@ -241,18 +232,18 @@ void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride, const __m128i signed_offset = _mm_set1_epi16(128 * 8); partial_lo[2] = _mm_sub_epi16(partial_lo[2], signed_offset); - cost[2] = kDivisionTable[7] * SquareSum_S16(partial_lo[2]); - cost[6] = kDivisionTable[7] * SquareSum_S16(partial_lo[6]); + cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]); + cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]); - const __m128i division_table[4] = {LoadUnaligned16(kDivisionTable), - LoadUnaligned16(kDivisionTable + 4), - LoadUnaligned16(kDivisionTable + 8), - LoadUnaligned16(kDivisionTable + 12)}; + const __m128i division_table[4] = {LoadUnaligned16(kCdefDivisionTable), + LoadUnaligned16(kCdefDivisionTable + 4), + LoadUnaligned16(kCdefDivisionTable + 8), + LoadUnaligned16(kCdefDivisionTable + 12)}; cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table); cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table); - const __m128i division_table_odd = LoadAligned16(kDivisionTableOdd); + const __m128i division_table_odd = LoadAligned16(kCdefDivisionTableOdd); cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd); cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd); @@ -315,24 +306,6 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, src + y_1 * stride + stride + x_1); } -// Load 4 vectors based on the given |direction|. Use when |block_width| == 2 to -// do 2 rows at a time. 
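[Editor's aside, not part of the patch.] The kDivisionTable that used to live here (and the kCdefDivisionTable now pulled in from cdef.inc) holds 840 / n for n = 1..8 and then mirrors those values, so the direction costs in Cost0Or4() and CostOdd() above can divide each diagonal's squared partial sum by its length (scaled by 840) using integer multiplies only. A compile-time restatement of that relationship, with the values copied from the deleted table above:

  // First half of the mirrored table, as it appears in the deleted copy above.
  constexpr unsigned kCdefDivisionTableHead[8] = {840, 420, 280, 210,
                                                  168, 140, 120, 105};

  constexpr bool TableIs840OverN() {
    for (int n = 1; n <= 8; ++n) {
      if (kCdefDivisionTableHead[n - 1] != 840u / n) return false;
    }
    return true;
  }
  static_assert(TableIs840OverN(), "entry n - 1 is 840 / n");
  // 840 = 2^3 * 3 * 5 * 7, so every quotient above is exact.
  static_assert(840 % 5 == 0 && 840 % 6 == 0 && 840 % 7 == 0 && 840 % 8 == 0,
                "840 is divisible by 1 through 8");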
-void LoadDirection2(const uint16_t* const src, const ptrdiff_t stride, - __m128i* output, const int direction) { - const int y_0 = kCdefDirections[direction][0][0]; - const int x_0 = kCdefDirections[direction][0][1]; - const int y_1 = kCdefDirections[direction][1][0]; - const int x_1 = kCdefDirections[direction][1][1]; - output[0] = - Load4x2(src - y_0 * stride - x_0, src - y_0 * stride - x_0 + stride); - output[1] = - Load4x2(src + y_0 * stride + x_0, src - y_0 * stride - x_0 + stride); - output[2] = - Load4x2(src - y_1 * stride - x_1, src - y_0 * stride - x_0 + stride); - output[3] = - Load4x2(src + y_1 * stride + x_1, src - y_0 * stride - x_0 + stride); -} - inline __m128i Constrain(const __m128i& pixel, const __m128i& reference, const __m128i& damping, const __m128i& threshold) { const __m128i diff = _mm_sub_epi16(pixel, reference); @@ -340,6 +313,11 @@ inline __m128i Constrain(const __m128i& pixel, const __m128i& reference, // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping), // 0, std::abs(diff)) const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping); + // For bitdepth == 8, the threshold range is [0, 15] and the damping range is + // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be + // larger than threshold. Subtract using saturation will return 0 when pixel + // == kCdefLargeValue. + static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue"); const __m128i thresh_minus_shifted_diff = _mm_subs_epu16(threshold, shifted_diff); const __m128i clamp_abs_diff = @@ -349,34 +327,35 @@ inline __m128i Constrain(const __m128i& pixel, const __m128i& reference, } inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val, - const __m128i& mask, const __m128i& tap, - const __m128i& damping, + const __m128i& tap, const __m128i& damping, const __m128i& threshold) { const __m128i constrained = Constrain(val, pixel, damping, threshold); - return _mm_mullo_epi16(_mm_and_si128(constrained, mask), tap); + return _mm_mullo_epi16(constrained, tap); } -template <int width> +template <int width, bool enable_primary = true, bool enable_secondary = true> void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, const int direction, const int primary_strength, const int secondary_strength, const int damping, uint8_t* dst, const ptrdiff_t dst_stride) { - static_assert(width == 8 || width == 4 || width == 2, "Invalid CDEF width."); - + static_assert(width == 8 || width == 4, "Invalid CDEF width."); + static_assert(enable_primary || enable_secondary, ""); __m128i primary_damping_shift, secondary_damping_shift; + // FloorLog2() requires input to be > 0. - if (primary_strength == 0) { - primary_damping_shift = _mm_setzero_si128(); - } else { + // 8-bit damping range: Y: [3, 6], UV: [2, 5]. + if (enable_primary) { + // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary + // for UV filtering. primary_damping_shift = _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength))); } - - if (secondary_strength == 0) { - secondary_damping_shift = _mm_setzero_si128(); - } else { + if (enable_secondary) { + // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is + // necessary. 
+ assert(damping - FloorLog2(secondary_strength) >= 0); secondary_damping_shift = - _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(secondary_strength))); + _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength)); } const __m128i primary_tap_0 = @@ -385,8 +364,6 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]); const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0); const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1); - const __m128i cdef_large_value = - _mm_set1_epi16(static_cast<int16_t>(kCdefLargeValue)); const __m128i cdef_large_value_mask = _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue)); const __m128i primary_threshold = _mm_set1_epi16(primary_strength); @@ -397,126 +374,113 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, __m128i pixel; if (width == 8) { pixel = LoadUnaligned16(src); - } else if (width == 4) { - pixel = LoadHi8(LoadLo8(src), src + src_stride); - } else { - pixel = Load4x2(src, src + src_stride); - } - - // Primary |direction|. - __m128i primary_val[4]; - if (width == 8) { - LoadDirection(src, src_stride, primary_val, direction); - } else if (width == 4) { - LoadDirection4(src, src_stride, primary_val, direction); } else { - LoadDirection2(src, src_stride, primary_val, direction); + pixel = LoadHi8(LoadLo8(src), src + src_stride); } __m128i min = pixel; - min = _mm_min_epu16(min, primary_val[0]); - min = _mm_min_epu16(min, primary_val[1]); - min = _mm_min_epu16(min, primary_val[2]); - min = _mm_min_epu16(min, primary_val[3]); - __m128i max = pixel; - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[0], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[1], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[2], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[3], cdef_large_value_mask)); - __m128i mask = _mm_cmplt_epi16(primary_val[0], cdef_large_value); - __m128i sum = - ApplyConstrainAndTap(pixel, primary_val[0], mask, primary_tap_0, - primary_damping_shift, primary_threshold); - mask = _mm_cmplt_epi16(primary_val[1], cdef_large_value); - sum = _mm_add_epi16( - sum, ApplyConstrainAndTap(pixel, primary_val[1], mask, primary_tap_0, - primary_damping_shift, primary_threshold)); - mask = _mm_cmplt_epi16(primary_val[2], cdef_large_value); - sum = _mm_add_epi16( - sum, ApplyConstrainAndTap(pixel, primary_val[2], mask, primary_tap_1, - primary_damping_shift, primary_threshold)); - mask = _mm_cmplt_epi16(primary_val[3], cdef_large_value); - sum = _mm_add_epi16( - sum, ApplyConstrainAndTap(pixel, primary_val[3], mask, primary_tap_1, - primary_damping_shift, primary_threshold)); - - // Secondary |direction| values (+/- 2). Clamp |direction|. - __m128i secondary_val[8]; - if (width == 8) { - LoadDirection(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); - } else if (width == 4) { - LoadDirection4(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection4(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); + __m128i sum; + + if (enable_primary) { + // Primary |direction|. 
+ __m128i primary_val[4]; + if (width == 8) { + LoadDirection(src, src_stride, primary_val, direction); + } else { + LoadDirection4(src, src_stride, primary_val, direction); + } + + min = _mm_min_epu16(min, primary_val[0]); + min = _mm_min_epu16(min, primary_val[1]); + min = _mm_min_epu16(min, primary_val[2]); + min = _mm_min_epu16(min, primary_val[3]); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]); + const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]); + const __m128i max_p = _mm_max_epu8(max_p01, max_p23); + max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask)); + + sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0, + primary_damping_shift, primary_threshold); + sum = _mm_add_epi16( + sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0, + primary_damping_shift, primary_threshold)); + sum = _mm_add_epi16( + sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1, + primary_damping_shift, primary_threshold)); + sum = _mm_add_epi16( + sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1, + primary_damping_shift, primary_threshold)); } else { - LoadDirection2(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection2(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); + sum = _mm_setzero_si128(); } - min = _mm_min_epu16(min, secondary_val[0]); - min = _mm_min_epu16(min, secondary_val[1]); - min = _mm_min_epu16(min, secondary_val[2]); - min = _mm_min_epu16(min, secondary_val[3]); - min = _mm_min_epu16(min, secondary_val[4]); - min = _mm_min_epu16(min, secondary_val[5]); - min = _mm_min_epu16(min, secondary_val[6]); - min = _mm_min_epu16(min, secondary_val[7]); - - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[0], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[1], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[2], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[3], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[4], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[5], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[6], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[7], cdef_large_value_mask)); - - mask = _mm_cmplt_epi16(secondary_val[0], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[0], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[1], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[1], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[2], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[2], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[3], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[3], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[4], 
cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[4], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[5], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[5], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[6], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[6], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[7], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[7], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. + __m128i secondary_val[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val, direction + 2); + LoadDirection(src, src_stride, secondary_val + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val, direction + 2); + LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); + } + + min = _mm_min_epu16(min, secondary_val[0]); + min = _mm_min_epu16(min, secondary_val[1]); + min = _mm_min_epu16(min, secondary_val[2]); + min = _mm_min_epu16(min, secondary_val[3]); + min = _mm_min_epu16(min, secondary_val[4]); + min = _mm_min_epu16(min, secondary_val[5]); + min = _mm_min_epu16(min, secondary_val[6]); + min = _mm_min_epu16(min, secondary_val[7]); + + const __m128i max_s01 = _mm_max_epu8(secondary_val[0], secondary_val[1]); + const __m128i max_s23 = _mm_max_epu8(secondary_val[2], secondary_val[3]); + const __m128i max_s45 = _mm_max_epu8(secondary_val[4], secondary_val[5]); + const __m128i max_s67 = _mm_max_epu8(secondary_val[6], secondary_val[7]); + const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23), + _mm_max_epu8(max_s45, max_s67)); + max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask)); + + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + } // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15); // 8 + sum @@ -536,20 +500,13 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, StoreLo8(dst, result); dst += dst_stride; ++y; - } else if (width == 4) { + } else { src += 2 * 
src_stride; Store4(dst, result); dst += dst_stride; Store4(dst, _mm_srli_si128(result, 4)); dst += dst_stride; y += 2; - } else { - src += 2 * src_stride; - Store2(dst, result); - dst += dst_stride; - Store2(dst, _mm_srli_si128(result, 2)); - dst += dst_stride; - y += 2; } } while (y < height); } @@ -558,29 +515,46 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, // inside the frame. However it requires the source input to be padded with a // constant large value if at the boundary. The input must be uint16_t. void CdefFilter_SSE4_1(const void* const source, const ptrdiff_t source_stride, - const int rows4x4, const int columns4x4, - const int curr_x, const int curr_y, - const int subsampling_x, const int subsampling_y, + const int block_width, const int block_height, const int primary_strength, const int secondary_strength, const int damping, const int direction, void* const dest, const ptrdiff_t dest_stride) { - const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x; - const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y; - const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x); - const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y); const auto* src = static_cast<const uint16_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - if (block_width == 8) { - DoCdef<8>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); - } else if (block_width == 4) { - DoCdef<4>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + if (secondary_strength > 0) { + if (primary_strength > 0) { + if (block_width == 8) { + DoCdef<8>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } else { + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } } else { - assert(block_width == 2); - DoCdef<2>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } } } diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h index 8b03db69f7a..24c801fd863 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h @@ -17,6 +17,7 @@ #ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_ #define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_ +#include "src/utils/compiler_attributes.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_SSE4_1 @@ -91,6 +92,14 @@ inline 
__m128i Load2x2(const void* src1, const void* src2) { return _mm_cvtsi32_si128(val1 | (val2 << 16)); } +// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1. +template <int lane> +inline __m128i Load2(const void* const buf, __m128i val) { + uint16_t temp; + memcpy(&temp, buf, 2); + return _mm_insert_epi16(val, temp, lane); +} + inline __m128i Load4(const void* src) { // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a @@ -136,6 +145,41 @@ inline __m128i LoadAligned16(const void* a) { } //------------------------------------------------------------------------------ +// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. + +inline __m128i MaskOverreads(const __m128i source, + const int over_read_in_bytes) { + __m128i dst = source; +#if LIBGAV1_MSAN + if (over_read_in_bytes > 0) { + __m128i mask = _mm_set1_epi8(-1); + for (int i = 0; i < over_read_in_bytes; ++i) { + mask = _mm_srli_si128(mask, 1); + } + dst = _mm_and_si128(dst, mask); + } +#else + static_cast<void>(over_read_in_bytes); +#endif + return dst; +} + +inline __m128i LoadLo8Msan(const void* const source, + const int over_read_in_bytes) { + return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8); +} + +inline __m128i LoadAligned16Msan(const void* const source, + const int over_read_in_bytes) { + return MaskOverreads(LoadAligned16(source), over_read_in_bytes); +} + +inline __m128i LoadUnaligned16Msan(const void* const source, + const int over_read_in_bytes) { + return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes); +} + +//------------------------------------------------------------------------------ // Store functions. inline void Store2(void* dst, const __m128i x) { @@ -156,6 +200,10 @@ inline void StoreHi8(void* a, const __m128i v) { _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v)); } +inline void StoreAligned16(void* a, const __m128i v) { + _mm_store_si128(static_cast<__m128i*>(a), v); +} + inline void StoreUnaligned16(void* a, const __m128i v) { _mm_storeu_si128(static_cast<__m128i*>(a), v); } diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc index 40ce568a491..a0ed3bea758 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "src/dsp/convolve.h" +#include "src/utils/constants.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_SSE4_1 @@ -33,8 +34,40 @@ namespace dsp { namespace low_bitdepth { namespace { +// TODO(slavarnway): Move to common neon/sse4 file. +int GetNumTapsInFilter(const int filter_index) { + if (filter_index < 2) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + return 6; + } + + if (filter_index == 2) { + // kInterpolationFilterEightTapSharp + return 8; + } + + if (filter_index == 3) { + // kInterpolationFilterBilinear + return 2; + } + + assert(filter_index > 3); + // For small sizes (width/height <= 4) the large filters are replaced with 4 + // tap options. + // If the original filters were |kInterpolationFilterEightTap| or + // |kInterpolationFilterEightTapSharp| then it becomes + // |kInterpolationFilterSwitchable|. + // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 + // tap filter. 
+ return 4; +} + +constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels; constexpr int kSubPixelMask = (1 << kSubPixelBits) - 1; constexpr int kHorizontalOffset = 3; +constexpr int kFilterIndexShift = 6; // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final @@ -177,6 +210,15 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, return _mm_packus_epi16(sum, sum); } +template <int filter_index> +__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + const __m128i sum = + SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + template <int num_taps, int step, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, @@ -195,7 +237,11 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, if (is_2d || is_compound) { const __m128i v_sum = HorizontalTaps8To16<filter_index>(&src[x], v_tap); - StoreUnaligned16(&dest16[x], v_sum); + if (is_2d) { + StoreAligned16(&dest16[x], v_sum); + } else { + StoreUnaligned16(&dest16[x], v_sum); + } } else { const __m128i result = SimpleHorizontalTaps<filter_index>(&src[x], v_tap); @@ -236,7 +282,12 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, int y = 0; do { if (is_2d) { - // TODO(slavarnway): Add 2d support + const __m128i sum = + HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + Store4(&dest16[0], sum); + dest16 += pred_stride; + Store4(&dest16[0], _mm_srli_si128(sum, 8)); + dest16 += pred_stride; } else { const __m128i sum = SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); @@ -254,13 +305,33 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, // generates context for the vertical pass. if (is_2d) { assert(height % 2 == 1); - // TODO(slavarnway): Add 2d support + __m128i sum; + const __m128i input = LoadLo8(&src[2]); + if (filter_index == 3) { + // 03 04 04 05 05 06 06 07 .... + const __m128i v_src_43 = + _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); + sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 + } else { + // 02 03 03 04 04 05 05 06 06 07 .... + const __m128i v_src_32 = + _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1); + // 04 05 05 06 06 07 07 08 ... 
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4); + const __m128i v_madd_32 = + _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 + const __m128i v_madd_54 = + _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 + sum = _mm_add_epi16(v_madd_54, v_madd_32); + } + sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); + Store4(dest16, sum); } } } } -template <int num_taps> +template <int num_taps, bool is_2d_vertical = false> LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, __m128i* v_tap) { if (num_taps == 8) { @@ -268,30 +339,295 @@ LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + } } else if (num_taps == 6) { const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + } } else if (num_taps == 4) { v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + } } else { // num_taps == 2 const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + } } } +template <int num_taps, bool is_compound> +__m128i SimpleSum2DVerticalTaps(const __m128i* const src, + const __m128i* const taps) { + __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); + __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); + if (num_taps >= 4) { + __m128i madd_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); + __m128i madd_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = 
_mm_add_epi32(sum_hi, madd_hi); + if (num_taps >= 6) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps == 8) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + } + } + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +template <int num_taps, bool is_compound = false> +void Filter2DVertical(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const __m128i* const taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + // The Horizontal pass uses |width| as |stride| for the intermediate buffer. + const ptrdiff_t src_stride = width; + + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + int x = 0; + do { + __m128i srcs[8]; + const uint16_t* src_x = src + x; + srcs[0] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadAligned16(src_x); + src_x += src_stride; + srcs[2] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadAligned16(src_x); + src_x += src_stride; + srcs[4] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadAligned16(src_x); + src_x += src_stride; + srcs[6] = LoadAligned16(src_x); + src_x += src_stride; + } + } + } + + int y = 0; + do { + srcs[next_row] = LoadAligned16(src_x); + src_x += src_stride; + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16 + x + y * dst_stride, sum); + } else { + StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum)); + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (++y < height); + x += 8; + } while (x < width); +} + +// Take advantage of |src_stride| == |width| to process two rows at a time. 
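In other words, because the horizontal pass writes its 16-bit intermediate with a stride equal to the block width, a 4-wide block keeps two consecutive rows in eight contiguous uint16_t values, which is exactly one aligned 16-byte load. A minimal sketch of that layout assumption (illustrative helper, not part of the patch):

  #include <cstdint>

  // For a 4-wide block the 2D intermediate buffer is tightly packed, so rows
  // r and r + 1 together span 8 uint16_t = 16 bytes and can be fetched with a
  // single LoadAligned16 in Filter2DVertical4xH below.
  inline const uint16_t* TwoRowSpan(const uint16_t* intermediate, int r) {
    constexpr int kStride = 4;          // stride == width for the 2D pass
    return intermediate + r * kStride;  // elements [0, 8) cover rows r, r + 1
  }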
+template <int num_taps, bool is_compound = false> +void Filter2DVertical4xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 4) { + srcs[2] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); + if (num_taps == 8) { + srcs[6] = LoadAligned16(src); + src += 8; + srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); + } + } + } + + int y = 0; + do { + srcs[num_taps] = LoadAligned16(src); + src += 8; + srcs[num_taps - 1] = _mm_unpacklo_epi64( + _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16, sum); + dst16 += 4 << 1; + } else { + const __m128i results = _mm_packus_epi16(sum, sum); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y += 2; + } while (y < height); +} + +// Take advantage of |src_stride| == |width| to process four rows at a time. +template <int num_taps> +void Filter2DVertical2xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + constexpr int next_row = (num_taps < 6) ? 4 : 8; + + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + if (num_taps == 8) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } + } + + int y = 0; + do { + srcs[next_row] = LoadAligned16(src); + src += 8; + if (num_taps == 2) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + } else if (num_taps == 4) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } else if (num_taps == 6) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + } else if (num_taps == 8) { + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); + srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); + } + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); + const __m128i results = _mm_packus_epi16(sum, sum); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. + // Therefore we don't need to check this condition when |height| > 4. 
+ if (num_taps <= 4 && height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + if (num_taps == 6) { + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + } else if (num_taps == 8) { + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + } + + y += 4; + } while (y < height); +} + template <bool is_2d = false, bool is_compound = false> LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, @@ -330,6 +666,765 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } } +void Convolve2D_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, const int subpixel_x, + const int subpixel_y, const int width, const int height, + void* prediction, const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + + // The output of the horizontal filter is guaranteed to fit in 16 bits. + alignas(16) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + const int intermediate_height = height + vertical_taps - 1; + + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + + DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width, + width, intermediate_height, subpixel_x, + horiz_filter_index); + + // Vertical filter. 
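As a worked example of the setup just above: with an 8-tap vertical filter and a 16-row block, the horizontal pass produces intermediate_height = height + vertical_taps - 1 = 16 + 8 - 1 = 23 rows of 16-bit output, and the source pointer is moved back by vertical_taps / 2 - 1 = 3 rows and kHorizontalOffset = 3 columns, so the first vertical window (rows -3 .. +4 relative to output row 0) reads rows the horizontal pass has actually produced, and even an 8-tap horizontal window has its three left-hand taps in bounds.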
+ auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask; + assert(filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } +} + +// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D +// Vertical calculations. +__m128i Compound1DShift(const __m128i sum) { + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int filter_index> +__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { + __m128i v_src[4]; + + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. 
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + } + const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + return sum; +} + +template <int filter_index, bool is_compound = false> +void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int width, const int height, + const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + assert(width >= 8); + + int x = 0; + do { + const uint8_t* src_x = src + x; + __m128i srcs[8]; + srcs[0] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadLo8(src_x); + src_x += src_stride; + srcs[2] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadLo8(src_x); + src_x += src_stride; + srcs[4] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadLo8(src_x); + src_x += src_stride; + srcs[6] = LoadLo8(src_x); + src_x += src_stride; + } + } + } + + int y = 0; + do { + srcs[next_row] = LoadLo8(src_x); + src_x += src_stride; + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16 + x + y * dst_stride, results); + } else { + const __m128i results = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results)); + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (++y < height); + x += 8; + } while (x < width); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + + int y = 0; + do { + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + y += 2; + } while (y < height); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; 
+ // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + int y = 0; + do { + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + y += 2; + } while (y < height); + } else if (num_taps == 6) { + srcs[6] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + int y = 0; + do { + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + y += 2; + } while (y < height); + } else if (num_taps == 8) { + srcs[8] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + int y = 0; + do { + // 70 71 72 73 + const __m128i d 
= Load4(src); + // 60 61 62 63 70 71 72 73 + srcs[6] = _mm_unpacklo_epi32(srcs[6], d); + src += src_stride; + // 80 81 82 83 + srcs[8] = Load4(src); + src += src_stride; + // 70 71 72 73 80 81 82 83 + srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + y += 2; + } while (y < height); + } +} + +template <int filter_index, bool negative_outside_taps = false> +void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + + int y = 0; + do { + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[2] = Load2<0>(src, srcs[2]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 + const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_2, 2); + // This uses srcs[0]..srcs[1]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[2]; + y += 4; + } while (y < height); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + + int y = 0; + do { + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2<0>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + // This uses srcs[0]..srcs[3]. 
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + y += 4; + } while (y < height); + } else if (num_taps == 6) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4x, 2); + + int y = 0; + do { + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + + // This uses srcs[0]..srcs[5]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + y += 4; + } while (y < height); + } else if (num_taps == 8) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. 
+ assert(height > 4); + srcs[8] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + int y = 0; + do { + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 + srcs[8] = Load2<1>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 a0 a1 + srcs[8] = Load2<2>(src, srcs[8]); + src += src_stride; + + // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + // 60 61 70 71 80 81 90 91 + srcs[6] = _mm_srli_si128(srcs_4_8, 4); + // 70 71 80 81 90 91 a0 a1 + srcs[7] = _mm_srli_si128(srcs_4_8, 6); + + // This uses srcs[0]..srcs[7]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + y += 4; + } while (y < height); + } +} + +void ConvolveVertical_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, + const int vertical_filter_index, + const int /*subpixel_x*/, const int subpixel_y, + const int width, const int height, + void* prediction, const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + const int filter_id = (subpixel_y >> 6) & kSubPixelMask; + assert(filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 2) { // 8 tap. 
+ SetupTaps<8>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else { + // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases. + // See convolve_neon.cc + SetupTaps<4>(&v_filter, taps); + + if (width == 2) { + FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } +} + void ConvolveCompoundCopy_SSE4( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, @@ -388,6 +1483,76 @@ void ConvolveCompoundCopy_SSE4( } } +void ConvolveCompoundVertical_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int vertical_filter_index, + const int /*subpixel_x*/, const int subpixel_y, const int width, + const int height, void* prediction, const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint16_t*>(prediction); + const int filter_id = (subpixel_y >> 6) & kSubPixelMask; + assert(filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 4) { + FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 4) { // 4 tap. 
+ SetupTaps<4>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else { + SetupTaps<4>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } +} + void ConvolveHorizontal_SSE4_1(const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, @@ -418,13 +1583,720 @@ void ConvolveCompoundHorizontal_SSE4_1( filter_index); } +void ConvolveCompound2D_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int horizontal_filter_index, const int vertical_filter_index, + const int subpixel_x, const int subpixel_y, const int width, + const int height, void* prediction, const ptrdiff_t /*pred_stride*/) { + // The output of the horizontal filter, i.e. the intermediate_result, is + // guaranteed to fit in int16_t. + alignas(16) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [4, 5]. + // Similarly for height. + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int intermediate_height = height + vertical_taps - 1; + const ptrdiff_t src_stride = reference_stride; + const auto* const src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - + kHorizontalOffset; + + DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>( + src, src_stride, intermediate_result, width, width, intermediate_height, + subpixel_x, horiz_filter_index); + + // Vertical filter. 
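Note that in the compound path the "prediction" written here is not final pixels: the vertical pass stores 16-bit, intermediate-precision values into a tightly packed buffer (dest_stride is set to width just below and the caller's pred_stride is ignored), and the two single-reference predictions are only combined afterwards by the separate blend kernels. A sketch of the staging buffer this implies (sizes are the library's maximums; the declaration is illustrative, not from the patch):

  // Tightly packed width x height staging buffer for one compound prediction.
  alignas(16) uint16_t compound_prediction[kMaxSuperBlockSizeInPixels *
                                           kMaxSuperBlockSizeInPixels];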
+ auto* dest = static_cast<uint16_t*>(prediction); + const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask; + assert(filter_id != 0); + + const ptrdiff_t dest_stride = width; + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } +} + +// Pre-transposed filters. +template <int filter_index> +inline void GetHalfSubPixelFilter(__m128i* output) { + // Filter 0 + alignas( + 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] = + {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}, + {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1}, + {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, + {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, + {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3}, + {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; + // Filter 1 + alignas(16) static constexpr int8_t + kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = { + {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0}, + {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1}, + {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, + {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, + {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}, + {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}}; + // Filter 2 + alignas( + 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] = + {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0}, + {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1}, + {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1}, + {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4}, + {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63}, + {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3}, + {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1}, + {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}}; + // Filter 3 + alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = { + {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4}, + {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}}; 
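These tables are the usual AV1 sub-pixel filters with every coefficient halved, which is why each 16-entry column sums to 64 rather than 128 and why the rounding shifts throughout this file drop one bit (kInterRoundBitsHorizontal - 1, kFilterBits - 1, and so on). For instance, the bilinear (Filter 3) taps for phase 4 are {48, 16} here, half of the full-precision {96, 32}. A trivial sketch of that relationship (illustrative helper, not from the library):

  // Recover full-precision AV1 filter taps from a halved filter column.
  inline void FullTapsFromHalf(const int* half_taps, int num_taps,
                               int* full_taps) {
    for (int k = 0; k < num_taps; ++k) full_taps[k] = 2 * half_taps[k];
  }

The remaining 4-tap tables follow below.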
+ // Filter 4 + alignas( + 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] = + {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1}, + {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, + {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, + {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}}; + // Filter 5 + alignas( + 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = { + {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1}, + {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, + {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, + {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}}; + switch (filter_index) { + case 0: + output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]); + break; + case 1: + // The term "mixed" refers to the fact that the outer taps have a mix of + // negative and positive values. + output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]); + break; + case 2: + output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]); + output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]); + output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]); + break; + case 3: + output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]); + break; + case 4: + output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]); + break; + default: + assert(filter_index == 5); + output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]); + output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]); + output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]); + output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]); + break; + } +} + +// There are many opportunities for overreading in scaled convolve, because +// the range of starting points for filter windows is anywhere from 0 to 16 +// for 8 destination pixels, and the window sizes range from 2 to 8. To +// accommodate this range concisely, we use |grade_x| to mean the most steps +// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2. 
+// More importantly, |grade_x| answers the question "how many vector loads are +// needed to cover the source values?" +// When |grade_x| == 1, the maximum number of source values needed is 8 separate +// starting positions plus 7 more to cover taps, all fitting into 16 bytes. +// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for +// every 8 |step_x| increments, on top of 8 possible taps. The first load covers +// the starting sources for each kernel, while the final load covers the taps. +// Since the offset value of src_x cannot exceed 8 and |num_taps| does not +// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of +// |step_x|. +template <int num_taps, int grade_x> +inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, + __m128i source[num_taps >> 1]) { + const __m128i src_vals = LoadUnaligned16(src); + source[0] = _mm_shuffle_epi8(src_vals, src_indices); + if (grade_x == 1) { + if (num_taps > 2) { + source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices); + } + if (num_taps > 4) { + source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices); + } + if (num_taps > 6) { + source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices); + } + } else { + assert(grade_x > 1); + assert(num_taps != 4); + // grade_x > 1 also means width >= 8 && num_taps != 4 + const __m128i src_vals_ext = LoadLo8(src + 16); + if (num_taps > 2) { + source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2), + src_indices); + source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4), + src_indices); + } + if (num_taps > 6) { + source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6), + src_indices); + } + } +} + +template <int num_taps> +inline void PrepareHorizontalTaps(const __m128i subpel_indices, + const __m128i* filter_taps, + __m128i* out_taps) { + const __m128i scale_index_offsets = + _mm_srli_epi16(subpel_indices, kFilterIndexShift); + const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask); + const __m128i filter_indices = + _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets), + filter_index_mask); + // Line up taps for maddubs_epi16. + // The unpack is also assumed to be lighter than shift+alignr. + for (int k = 0; k < (num_taps >> 1); ++k) { + const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices); + const __m128i taps1 = + _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices); + out_taps[k] = _mm_unpacklo_epi8(taps0, taps1); + } +} + +inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) { + const __m128i src_indices16 = + _mm_srli_epi16(subpel_indices, kScaleSubPixelBits); + const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16); + return _mm_unpacklo_epi8(src_indices, + _mm_add_epi8(src_indices, _mm_set1_epi8(1))); +} + +template <int grade_x, int filter_index, int num_taps> +inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride, + int width, int subpixel_x, int step_x, + int intermediate_height, + int16_t* intermediate) { + // Account for the 0-taps that precede the 2 nonzero taps. 
+ const int kernel_offset = (8 - num_taps) >> 1; + const int ref_x = subpixel_x >> kScaleSubPixelBits; + const int step_x8 = step_x << 3; + __m128i filter_taps[num_taps]; + GetHalfSubPixelFilter<filter_index>(filter_taps); + const __m128i index_steps = + _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), + _mm_set1_epi16(static_cast<int16_t>(step_x))); + + __m128i taps[num_taps >> 1]; + __m128i source[num_taps >> 1]; + int p = subpixel_x; + // Case when width <= 4 is possible. + if (filter_index >= 3) { + if (filter_index > 3 || width <= 4) { + const uint8_t* src_x = + &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; + // Only add steps to the 10-bit truncated p to avoid overflow. + const __m128i p_fraction = _mm_set1_epi16(p & 1023); + const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction); + PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps); + const __m128i packed_indices = HorizontalScaleIndices(subpel_indices); + + int y = intermediate_height; + do { + // Load and line up source values with the taps. Width 4 means no need + // to load extended source. + PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices, + source); + + StoreLo8(intermediate, RightShiftWithRounding_S16( + SumOnePassTaps<filter_index>(source, taps), + kInterRoundBitsHorizontal - 1)); + src_x += src_stride; + intermediate += kIntermediateStride; + } while (--y != 0); + return; + } + } + + // |width| >= 8 + int x = 0; + do { + const uint8_t* src_x = + &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; + int16_t* intermediate_x = intermediate + x; + // Only add steps to the 10-bit truncated p to avoid overflow. + const __m128i p_fraction = _mm_set1_epi16(p & 1023); + const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction); + PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps); + const __m128i packed_indices = HorizontalScaleIndices(subpel_indices); + + int y = intermediate_height; + do { + // For each x, a lane of src_k[k] contains src_x[k]. + PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source); + + // Shift by one less because the taps are halved. + StoreAligned16( + intermediate_x, + RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps), + kInterRoundBitsHorizontal - 1)); + src_x += src_stride; + intermediate_x += kIntermediateStride; + } while (--y != 0); + x += 8; + p += step_x8; + } while (x < width); +} + +template <int num_taps> +inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) { + // Avoid overreading the filter due to starting at kernel_offset. + // The only danger of overread is in the final filter, which has 4 taps. + const __m128i filter = + _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps)); + output[0] = _mm_shuffle_epi32(filter, 0); + if (num_taps > 2) { + output[1] = _mm_shuffle_epi32(filter, 0x55); + } + if (num_taps > 4) { + output[2] = _mm_shuffle_epi32(filter, 0xAA); + } + if (num_taps > 6) { + output[3] = _mm_shuffle_epi32(filter, 0xFF); + } +} + +// Process eight 16 bit inputs and output eight 16 bit values. 
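The Sum2DVerticalTaps helpers below lean on one SSE idiom worth spelling out: _mm_unpacklo_epi16 / _mm_unpackhi_epi16 interleave two adjacent intermediate rows, and _mm_madd_epi16 then multiplies each interleaved pair by a (tap k, tap k+1) pair and adds the products into a 32-bit lane, so every madd applies two vertical taps at once without risking 16-bit overflow. A scalar model of one such lane (illustrative, not from the patch):

  #include <cstdint>

  // What one 32-bit lane of
  //   _mm_madd_epi16(_mm_unpacklo_epi16(row_k, row_k1), taps_k_k1)
  // computes: two vertical taps applied to one column, accumulated at 32 bits.
  inline int32_t MaddPair(int16_t row_k_px, int16_t row_k1_px, int16_t tap_k,
                          int16_t tap_k1) {
    return static_cast<int32_t>(row_k_px) * tap_k +
           static_cast<int32_t>(row_k1_px) * tap_k1;
  }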
+template <int num_taps, bool is_compound> +inline __m128i Sum2DVerticalTaps(const __m128i* const src, + const __m128i* taps) { + const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]); + const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]); + if (num_taps > 2) { + const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1])); + const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1])); + } + if (num_taps > 4) { + const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2])); + const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2])); + } + if (num_taps > 6) { + const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3])); + const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3])); + } + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +// Bottom half of each src[k] is the source for one filter, and the top half +// is the source for the other filter, for the next destination row. +template <int num_taps, bool is_compound> +__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo, + const __m128i* taps_hi) { + const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]); + const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]); + if (num_taps > 2) { + const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1])); + const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1])); + } + if (num_taps > 4) { + const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2])); + const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2])); + } + if (num_taps > 6) { + const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3])); + const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3])); + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +// |width_class| is 2, 4, or 8, according to the Store function that should be +// used. 
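Before the SIMD version below, a scalar sketch of the bookkeeping ConvolveVerticalScale performs: each destination row derives both its first intermediate source row and its filter phase from the running position |p|, then advances |p| by |step_y|. The constants mirror libgav1's (kScaleSubPixelBits == 10, kSubPixelMask == 15); the helper itself is illustrative only.

  // Per-row source row and filter phase selection for the vertical scale pass.
  inline void VerticalScaleIndexing(int subpixel_y, int step_y, int height,
                                    int* first_row, int* filter_id) {
    constexpr int kScaleSubPixelBits = 10;
    constexpr int kSubPixelMask = 15;
    int p = subpixel_y & 1023;
    for (int y = 0; y < height; ++y) {
      first_row[y] = p >> kScaleSubPixelBits;   // row in the intermediate
      filter_id[y] = (p >> 6) & kSubPixelMask;  // one of 16 sub-pixel phases
      p += step_y;
    }
  }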
+template <int num_taps, int width_class, bool is_compound> +#if LIBGAV1_MSAN +__attribute__((no_sanitize_memory)) void ConvolveVerticalScale( +#else +inline void ConvolveVerticalScale( +#endif + const int16_t* src, const int width, const int subpixel_y, + const int filter_index, const int step_y, const int height, void* dest, + const ptrdiff_t dest_stride) { + constexpr ptrdiff_t src_stride = kIntermediateStride; + constexpr int kernel_offset = (8 - num_taps) / 2; + const int16_t* src_y = src; + // |dest| is 16-bit in compound mode, Pixel otherwise. + auto* dest16_y = static_cast<uint16_t*>(dest); + auto* dest_y = static_cast<uint8_t*>(dest); + __m128i s[num_taps]; + + int p = subpixel_y & 1023; + int y = height; + if (width_class <= 4) { + __m128i filter_taps_lo[num_taps >> 1]; + __m128i filter_taps_hi[num_taps >> 1]; + do { // y > 0 + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadLo8(src_y + i * src_stride); + } + int filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter0 = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo); + p += step_y; + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadHi8(s[i], src_y + i * src_stride); + } + filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter1 = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi); + p += step_y; + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + + const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>( + s, filter_taps_lo, filter_taps_hi); + if (is_compound) { + assert(width_class > 2); + StoreLo8(dest16_y, sums); + dest16_y += dest_stride; + StoreHi8(dest16_y, sums); + dest16_y += dest_stride; + } else { + const __m128i result = _mm_packus_epi16(sums, sums); + if (width_class == 2) { + Store2(dest_y, result); + dest_y += dest_stride; + Store2(dest_y, _mm_srli_si128(result, 4)); + } else { + Store4(dest_y, result); + dest_y += dest_stride; + Store4(dest_y, _mm_srli_si128(result, 4)); + } + dest_y += dest_stride; + } + y -= 2; + } while (y != 0); + return; + } + + // |width_class| >= 8 + __m128i filter_taps[num_taps >> 1]; + do { // y > 0 + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + const int filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter, filter_taps); + + int x = 0; + do { // x < width + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadUnaligned16(src_y + i * src_stride); + } + + const __m128i sums = + Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps); + if (is_compound) { + StoreUnaligned16(dest16_y + x, sums); + } else { + StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums)); + } + x += 8; + src_y += 8; + } while (x < width); + p += step_y; + dest_y += dest_stride; + dest16_y += dest_stride; + } while (--y != 0); +} + +template <bool is_compound> +void ConvolveScale2D_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int subpixel_x, const int subpixel_y, + const int step_x, const int step_y, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + assert(step_x <= 2048); 
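To read the assert above: with kScaleSubPixelBits == 10, a step of 1 << 10 = 1024 advances exactly one source pixel per destination pixel, so 2048 corresponds to the largest (2:1) downscale the scaled-reference path has to handle. That bound is also what keeps |grade_x| at 1 or 2 in ConvolveHorizontalScale above: eight outputs at the maximum step span 8 * 2048 >> 10 = 16 source pixels, which, together with up to 7 extra tap positions, still fits within the 16-byte load plus the 8-byte extension load used when |grade_x| == 2.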
+ // The output of the horizontal filter, i.e. the intermediate_result, is + // guaranteed to fit in int16_t. + // TODO(petersonab): Reduce intermediate block stride to width to make smaller + // blocks faster. + alignas(16) int16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)]; + const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); + const int intermediate_height = + (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> + kScaleSubPixelBits) + + num_vert_taps; + + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [3, 5]. + // Similarly for height. + int16_t* intermediate = intermediate_result; + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference); + const int vert_kernel_offset = (8 - num_vert_taps) / 2; + src += vert_kernel_offset * src_stride; + + // Derive the maximum value of |step_x| at which all source values fit in one + // 16-byte load. Final index is src_x + |num_taps| - 1 < 16 + // step_x*7 is the final base sub-pixel index for the shuffle mask for filter + // inputs in each iteration on large blocks. When step_x is large, we need a + // second register and alignr in order to gather all filter inputs. + // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap. + const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int kernel_start_ceiling = 16 - num_horiz_taps; + // This truncated quotient |grade_x_threshold| selects |step_x| such that: + // (step_x * 7) >> kScaleSubPixelBits < single load limit + const int grade_x_threshold = + (kernel_start_ceiling << kScaleSubPixelBits) / 7; + switch (horiz_filter_index) { + case 0: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 1: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + + } else { + ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 2: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 3: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 4: + assert(width <= 4); + ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + break; + default: + assert(horiz_filter_index == 5); + assert(width <= 4); + ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + + // Vertical filter. 
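  // (Editor's gloss, not part of the patch.) The switch below dispatches on the
  // tap count implied by |vert_filter_index| (0/1 -> 6 taps, 2 -> 8 taps,
  // 3 -> 2 taps, 4/5 -> 4 taps) and on a width class of 2, 4 or 8+, matching
  // ConvolveVerticalScale's <num_taps, width_class, is_compound> template
  // parameters.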
+ intermediate = intermediate_result; + switch (vert_filter_index) { + case 0: + case 1: + if (!is_compound && width == 2) { + ConvolveVerticalScale<6, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<6, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<6, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + case 2: + if (!is_compound && width == 2) { + ConvolveVerticalScale<8, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<8, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<8, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + case 3: + if (!is_compound && width == 2) { + ConvolveVerticalScale<2, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<2, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<2, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + default: + assert(vert_filter_index == 4 || vert_filter_index == 5); + if (!is_compound && width == 2) { + ConvolveVerticalScale<4, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<4, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<4, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + } +} + void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1; + dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1; + dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1; dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4; dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1; + + dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>; + dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>; } } // namespace diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h index 92f35d79426..e449a87436f 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h @@ -38,6 +38,14 @@ void ConvolveInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical +#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_Convolve2D +#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1 +#endif + #ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy 
#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1 #endif @@ -46,6 +54,22 @@ void ConvolveInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical +#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D +#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D +#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D +#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_ENABLE_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_ diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc index 78dec96bc69..edb8b1405f8 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc @@ -1143,7 +1143,7 @@ template <int bitdepth> struct LoopFilterFuncs_SSE4_1 { LoopFilterFuncs_SSE4_1() = delete; - static const int kThreshShift = bitdepth - 8; + static constexpr int kThreshShift = bitdepth - 8; static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, int hev_thresh); diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc index 02b7ed03e1a..7a01ab15aae 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc @@ -36,14 +36,6 @@ namespace dsp { namespace low_bitdepth { namespace { -// Precision of a division table (mtable) -constexpr int kSgrProjScaleBits = 20; -constexpr int kSgrProjReciprocalBits = 12; -// Core self-guided restoration precision bits. -constexpr int kSgrProjSgrBits = 8; -// Precision bits of generated values higher than source before projection. -constexpr int kSgrProjRestoreBits = 4; - // Note: range of wiener filter coefficients. // Wiener filter coefficients are symmetric, and their sum is 1 (128). // The range of each coefficient: @@ -85,12 +77,12 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1; const auto* src = static_cast<const uint8_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - const ptrdiff_t buffer_stride = buffer->wiener_buffer_stride; - auto* wiener_buffer = buffer->wiener_buffer; + const ptrdiff_t buffer_stride = (width + 7) & ~7; + auto* wiener_buffer = buffer->wiener_buffer + buffer_stride; // horizontal filtering. 
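  // (Editor's note, not part of the patch.) The new |buffer_stride| above,
  // (width + 7) & ~7, rounds |width| up to the next multiple of 8 (e.g.
  // 1 -> 8, 8 -> 8, 13 -> 16), presumably so every intermediate row written by
  // the horizontal pass spans a whole number of 8-lane 16-bit vectors.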
PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter); const int center_tap = 3; - src -= center_tap * source_stride + center_tap; + src -= (center_tap - 1) * source_stride + center_tap; const int horizontal_rounding = 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); @@ -108,7 +100,7 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, const __m128i v_offset_shift = _mm_cvtsi32_si128(7 - kInterRoundBitsHorizontal); - int y = 0; + int y = height + kSubPixelTaps - 4; do { int x = 0; do { @@ -156,9 +148,16 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, } while (x < width); src += source_stride; wiener_buffer += buffer_stride; - } while (++y < height + kSubPixelTaps - 2); - + } while (--y != 0); + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. + memcpy(wiener_buffer, wiener_buffer - buffer_stride, + sizeof(*wiener_buffer) * width); wiener_buffer = buffer->wiener_buffer; + memcpy(wiener_buffer, wiener_buffer + buffer_stride, + sizeof(*wiener_buffer) * width); + // vertical filtering. PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter); @@ -211,521 +210,1380 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, } while (++y < height); } -// Section 7.17.3. -// a2: range [1, 256]. -// if (z >= 255) -// a2 = 256; -// else if (z == 0) -// a2 = 1; -// else -// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); -constexpr int kXByXPlus1[256] = { - 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, - 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, - 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, - 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, - 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, - 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, - 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 256}; - -inline __m128i HorizontalAddVerticalSumsRadius1(const uint32_t* vert_sums) { - // Horizontally add vertical sums to get total box sum. 
- const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]); - const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]); - const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4); - const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8); - const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321); - const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432); - return v_s1; -} - -inline __m128i HorizontalAddVerticalSumsRadius2(const uint32_t* vert_sums) { - // Horizontally add vertical sums to get total box sum. - const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]); - const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]); - const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4); - const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8); - const __m128i v_sums_6543 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 12); - const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321); - const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432); - const __m128i v_s2 = _mm_add_epi32(v_s1, v_sums_6543); - const __m128i v_s3 = _mm_add_epi32(v_s2, v_sums_7654); - return v_s3; -} - -void BoxFilterPreProcessRadius1_SSE4_1( - const uint8_t* const src, ptrdiff_t stride, int width, int height, - uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride, - uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) { - assert(s != 0); - const uint32_t n = 9; - const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const __m128i v_one_over_n = - _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0); - const __m128i v_sgrbits = - _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0); - -#if LIBGAV1_MSAN - // Over-reads occur in the x loop, so set to a known value. - memset(&vertical_sums[width], 0, 8 * sizeof(vertical_sums[0])); - memset(&vertical_sum_of_squares[width], 0, - 8 * sizeof(vertical_sum_of_squares[0])); -#endif +//------------------------------------------------------------------------------ +// SGR - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. - int y = -1; - do { - const uint8_t* top_left = &src[(y - 1) * stride - 2]; - // Calculate the box vertical sums for each x position. - int vsx = -2; - do { - const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left)); - const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride)); - const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2)); - const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0); - const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1); - const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2); - const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1); - const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2); - const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1); - const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2); - StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a012); - StoreUnaligned16(&vertical_sums[vsx], v_b012); - top_left += 4; - vsx += 4; - } while (vsx <= width + 1); - - int x = -1; +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
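(Editor's aside, not part of the patch.) The helpers that follow borrow NEON's vaddl/vaddw/vmull naming but widen operands with an unpack against a zero register, which is the pattern the comment above recommends over the cvtepu intrinsics. A minimal sketch of that idiom:

  #include <emmintrin.h>  // SSE2: _mm_unpacklo_epi8, _mm_setzero_si128.

  // Zero-extends the low eight uint8 lanes of |a| to eight uint16 lanes
  // without _mm_cvtepu8_epi16(); VaddlLo8() and friends below build on this.
  inline __m128i ZeroExtendLo8To16(const __m128i a) {
    return _mm_unpacklo_epi8(a, _mm_setzero_si128());
  }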
+ +inline __m128i VaddlLo8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a0, b0); +} + +inline __m128i VaddlHi8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a0, b0); +} + +inline __m128i VaddlLo16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a0, b0); +} + +inline __m128i VaddlHi16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a0, b0); +} + +inline __m128i VaddwLo8(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a, b0); +} + +inline __m128i VaddwHi8(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a, b0); +} + +inline __m128i VaddwLo16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a, b0); +} + +inline __m128i VaddwHi16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a, b0); +} + +// Using VgetLane16() can save a sign extension instruction. +template <int n> +inline int16_t VgetLane16(const __m128i a) { + return _mm_extract_epi16(a, n); +} + +inline __m128i VmullLo8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + return _mm_mullo_epi16(a0, b0); +} + +inline __m128i VmullHi8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128()); + return _mm_mullo_epi16(a0, b0); +} + +inline __m128i VmullNLo8(const __m128i a, const int16_t b) { + const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128()); + return _mm_madd_epi16(a0, _mm_set1_epi32(b)); +} + +inline __m128i VmullNHi8(const __m128i a, const int16_t b) { + const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128()); + return _mm_madd_epi16(a0, _mm_set1_epi32(b)); +} + +inline __m128i VmullLo16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a0, b0); +} + +inline __m128i VmullHi16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a0, b0); +} + +inline __m128i VmulwLo16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a, b0); +} + +inline __m128i VmulwHi16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a, b0); +} + +inline __m128i VmlalNLo16(const __m128i sum, const __m128i a, const int16_t b) { + return _mm_add_epi32(sum, VmullNLo8(a, b)); +} + +inline __m128i VmlalNHi16(const __m128i sum, const __m128i a, const int16_t b) { + return 
_mm_add_epi32(sum, VmullNHi8(a, b)); +} + +inline __m128i VmlawLo16(const __m128i sum, const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(sum, _mm_madd_epi16(a, b0)); +} + +inline __m128i VmlawHi16(const __m128i sum, const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(sum, _mm_madd_epi16(a, b0)); +} + +inline __m128i VrshrNS32(const __m128i a, const int b) { + const __m128i sum = _mm_add_epi32(a, _mm_set1_epi32(1 << (b - 1))); + return _mm_srai_epi32(sum, b); +} + +inline __m128i VrshrN32(const __m128i a, const int b) { + const __m128i sum = _mm_add_epi32(a, _mm_set1_epi32(1 << (b - 1))); + return _mm_srli_epi32(sum, b); +} + +inline __m128i VshllN8(const __m128i a, const int b) { + const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + return _mm_slli_epi16(a0, b); +} + +template <int n> +inline __m128i CalcAxN(const __m128i a) { + static_assert(n == 9 || n == 25, ""); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. + // return _mm_mullo_epi32(a, _mm_set1_epi32(n)); + const __m128i ax9 = _mm_add_epi32(a, _mm_slli_epi32(a, 3)); + if (n == 9) return ax9; + if (n == 25) return _mm_add_epi32(ax9, _mm_slli_epi32(a, 4)); +} + +template <int n> +inline __m128i CalculateSgrMA2(const __m128i sum_sq, const __m128i sum, + const uint32_t s) { + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + const __m128i axn = CalcAxN<n>(sum_sq); + const __m128i sub = _mm_sub_epi32(axn, dxd); + const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128()); + + // z = RightShiftWithRounding(p * s, kSgrProjScaleBits); + const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(s)); + return VrshrN32(pxs, kSgrProjScaleBits); +} + +inline __m128i CalculateIntermediate4(const __m128i sgr_ma2, const __m128i sum, + const uint32_t one_over_n) { + // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n + // 1 << kSgrProjSgrBits = 256 + // |a2| = [1, 256] + // |sgr_ma2| max value = 255 + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + const __m128i sgr_ma2q = _mm_unpacklo_epi8(sgr_ma2, _mm_setzero_si128()); + const __m128i s = _mm_unpackhi_epi16(sgr_ma2q, _mm_setzero_si128()); + const __m128i m = _mm_madd_epi16(s, sum); + const __m128i b2 = _mm_mullo_epi32(m, _mm_set1_epi32(one_over_n)); + // static_cast<int>(RightShiftWithRounding(b2, kSgrProjReciprocalBits)); + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i truncate_u32 = VrshrN32(b2, kSgrProjReciprocalBits); + return _mm_packus_epi32(truncate_u32, truncate_u32); +} + +inline __m128i CalculateIntermediate8(const __m128i sgr_ma2, const __m128i sum, + const uint32_t one_over_n) { + // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n + // 1 << kSgrProjSgrBits = 256 + // |a2| = [1, 256] + // |sgr_ma2| max value = 255 + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. 
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + const __m128i sgr_ma2q = _mm_unpackhi_epi8(sgr_ma2, _mm_setzero_si128()); + const __m128i m0 = VmullLo16(sgr_ma2q, sum); + const __m128i m1 = VmullHi16(sgr_ma2q, sum); + const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); + const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n)); + // static_cast<int>(RightShiftWithRounding(b2, kSgrProjReciprocalBits)); + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i b2_lo = VrshrN32(m2, kSgrProjReciprocalBits); + const __m128i b2_hi = VrshrN32(m3, kSgrProjReciprocalBits); + return _mm_packus_epi32(b2_lo, b2_hi); +} + +inline __m128i Sum3_16(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = _mm_add_epi16(left, middle); + return _mm_add_epi16(sum, right); +} + +inline __m128i Sum3_32(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = _mm_add_epi32(left, middle); + return _mm_add_epi32(sum, right); +} + +inline __m128i Sum3W_16(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = VaddlLo8(left, middle); + return VaddwLo8(sum, right); +} + +inline __m128i Sum3WLo_16(const __m128i a[3]) { + return Sum3W_16(a[0], a[1], a[2]); +} + +inline __m128i Sum3WHi_16(const __m128i a[3]) { + const __m128i sum = VaddlHi8(a[0], a[1]); + return VaddwHi8(sum, a[2]); +} + +inline __m128i Sum3WLo_32(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = VaddlLo16(left, middle); + return VaddwLo16(sum, right); +} + +inline __m128i Sum3WHi_32(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = VaddlHi16(left, middle); + return VaddwHi16(sum, right); +} + +inline __m128i* Sum3W_16x2(const __m128i a[3], __m128i sum[2]) { + sum[0] = Sum3WLo_16(a); + sum[1] = Sum3WHi_16(a); + return sum; +} + +inline __m128i* Sum3W(const __m128i a[3], __m128i sum[2]) { + sum[0] = Sum3WLo_32(a[0], a[1], a[2]); + sum[1] = Sum3WHi_32(a[0], a[1], a[2]); + return sum; +} + +template <int index> +inline __m128i Sum3WLo(const __m128i a[3][2]) { + const __m128i b0 = a[0][index]; + const __m128i b1 = a[1][index]; + const __m128i b2 = a[2][index]; + return Sum3WLo_32(b0, b1, b2); +} + +inline __m128i Sum3WHi(const __m128i a[3][2]) { + const __m128i b0 = a[0][0]; + const __m128i b1 = a[1][0]; + const __m128i b2 = a[2][0]; + return Sum3WHi_32(b0, b1, b2); +} + +inline __m128i* Sum3W(const __m128i a[3][2], __m128i sum[3]) { + sum[0] = Sum3WLo<0>(a); + sum[1] = Sum3WHi(a); + sum[2] = Sum3WLo<1>(a); + return sum; +} + +inline __m128i Sum5_16(const __m128i a[5]) { + const __m128i sum01 = _mm_add_epi16(a[0], a[1]); + const __m128i sum23 = _mm_add_epi16(a[2], a[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, a[4]); +} + +inline __m128i Sum5_32(const __m128i a[5]) { + const __m128i sum01 = _mm_add_epi32(a[0], a[1]); + const __m128i sum23 = _mm_add_epi32(a[2], a[3]); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, a[4]); +} + +inline __m128i Sum5WLo_16(const __m128i a[5]) { + const __m128i sum01 = VaddlLo8(a[0], a[1]); + const __m128i sum23 = VaddlLo8(a[2], a[3]); + const __m128i sum = 
_mm_add_epi16(sum01, sum23); + return VaddwLo8(sum, a[4]); +} + +inline __m128i Sum5WHi_16(const __m128i a[5]) { + const __m128i sum01 = VaddlHi8(a[0], a[1]); + const __m128i sum23 = VaddlHi8(a[2], a[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return VaddwHi8(sum, a[4]); +} + +inline __m128i Sum5WLo_32(const __m128i a[5]) { + const __m128i sum01 = VaddlLo16(a[0], a[1]); + const __m128i sum23 = VaddlLo16(a[2], a[3]); + const __m128i sum0123 = _mm_add_epi32(sum01, sum23); + return VaddwLo16(sum0123, a[4]); +} + +inline __m128i Sum5WHi_32(const __m128i a[5]) { + const __m128i sum01 = VaddlHi16(a[0], a[1]); + const __m128i sum23 = VaddlHi16(a[2], a[3]); + const __m128i sum0123 = _mm_add_epi32(sum01, sum23); + return VaddwHi16(sum0123, a[4]); +} + +inline __m128i* Sum5W_16D(const __m128i a[5], __m128i sum[2]) { + sum[0] = Sum5WLo_16(a); + sum[1] = Sum5WHi_16(a); + return sum; +} + +inline __m128i* Sum5W_32x2(const __m128i a[5], __m128i sum[2]) { + sum[0] = Sum5WLo_32(a); + sum[1] = Sum5WHi_32(a); + return sum; +} + +template <int index> +inline __m128i Sum5WLo(const __m128i a[5][2]) { + __m128i b[5]; + b[0] = a[0][index]; + b[1] = a[1][index]; + b[2] = a[2][index]; + b[3] = a[3][index]; + b[4] = a[4][index]; + return Sum5WLo_32(b); +} + +inline __m128i Sum5WHi(const __m128i a[5][2]) { + __m128i b[5]; + b[0] = a[0][0]; + b[1] = a[1][0]; + b[2] = a[2][0]; + b[3] = a[3][0]; + b[4] = a[4][0]; + return Sum5WHi_32(b); +} + +inline __m128i* Sum5W_32x3(const __m128i a[5][2], __m128i sum[3]) { + sum[0] = Sum5WLo<0>(a); + sum[1] = Sum5WHi(a); + sum[2] = Sum5WLo<1>(a); + return sum; +} + +inline __m128i Sum3Horizontal(const __m128i a) { + const auto left = a; + const auto middle = _mm_srli_si128(a, 2); + const auto right = _mm_srli_si128(a, 4); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3Horizontal_16(const __m128i a[2]) { + const auto left = a[0]; + const auto middle = _mm_alignr_epi8(a[1], a[0], 2); + const auto right = _mm_alignr_epi8(a[1], a[0], 4); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3Horizontal_32(const __m128i a[2]) { + const auto left = a[0]; + const auto middle = _mm_alignr_epi8(a[1], a[0], 4); + const auto right = _mm_alignr_epi8(a[1], a[0], 8); + return Sum3_32(left, middle, right); +} + +inline __m128i* Sum3Horizontal_32x2(const __m128i a[3], __m128i sum[2]) { + { + const auto left = a[0]; + const auto middle = _mm_alignr_epi8(a[1], a[0], 4); + const auto right = _mm_alignr_epi8(a[1], a[0], 8); + sum[0] = Sum3_32(left, middle, right); + } + { + const auto left = a[1]; + const auto middle = _mm_alignr_epi8(a[2], a[1], 4); + const auto right = _mm_alignr_epi8(a[2], a[1], 8); + sum[1] = Sum3_32(left, middle, right); + } + return sum; +} + +inline __m128i Sum3HorizontalOffset1(const __m128i a) { + const auto left = _mm_srli_si128(a, 2); + const auto middle = _mm_srli_si128(a, 4); + const auto right = _mm_srli_si128(a, 6); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3HorizontalOffset1_16(const __m128i a[2]) { + const auto left = _mm_alignr_epi8(a[1], a[0], 2); + const auto middle = _mm_alignr_epi8(a[1], a[0], 4); + const auto right = _mm_alignr_epi8(a[1], a[0], 6); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3HorizontalOffset1_32(const __m128i a[2]) { + const auto left = _mm_alignr_epi8(a[1], a[0], 4); + const auto middle = _mm_alignr_epi8(a[1], a[0], 8); + const auto right = _mm_alignr_epi8(a[1], a[0], 12); + return Sum3_32(left, middle, right); +} + +inline void Sum3HorizontalOffset1_32x2(const 
__m128i a[3], __m128i sum[2]) { + sum[0] = Sum3HorizontalOffset1_32(a + 0); + sum[1] = Sum3HorizontalOffset1_32(a + 1); +} + +inline __m128i Sum5Horizontal(const __m128i a) { + __m128i s[5]; + s[0] = a; + s[1] = _mm_srli_si128(a, 2); + s[2] = _mm_srli_si128(a, 4); + s[3] = _mm_srli_si128(a, 6); + s[4] = _mm_srli_si128(a, 8); + return Sum5_16(s); +} + +inline __m128i Sum5Horizontal_16(const __m128i a[2]) { + __m128i s[5]; + s[0] = a[0]; + s[1] = _mm_alignr_epi8(a[1], a[0], 2); + s[2] = _mm_alignr_epi8(a[1], a[0], 4); + s[3] = _mm_alignr_epi8(a[1], a[0], 6); + s[4] = _mm_alignr_epi8(a[1], a[0], 8); + return Sum5_16(s); +} + +inline __m128i Sum5Horizontal_32(const __m128i a[2]) { + __m128i s[5]; + s[0] = a[0]; + s[1] = _mm_alignr_epi8(a[1], a[0], 4); + s[2] = _mm_alignr_epi8(a[1], a[0], 8); + s[3] = _mm_alignr_epi8(a[1], a[0], 12); + s[4] = a[1]; + return Sum5_32(s); +} + +inline __m128i* Sum5Horizontal_32x2(const __m128i a[3], __m128i sum[2]) { + __m128i s[5]; + s[0] = a[0]; + s[1] = _mm_alignr_epi8(a[1], a[0], 4); + s[2] = _mm_alignr_epi8(a[1], a[0], 8); + s[3] = _mm_alignr_epi8(a[1], a[0], 12); + s[4] = a[1]; + sum[0] = Sum5_32(s); + s[0] = a[1]; + s[1] = _mm_alignr_epi8(a[2], a[1], 4); + s[2] = _mm_alignr_epi8(a[2], a[1], 8); + s[3] = _mm_alignr_epi8(a[2], a[1], 12); + s[4] = a[2]; + sum[1] = Sum5_32(s); + return sum; +} + +template <int size, int offset> +inline void BoxFilterPreProcess4(const __m128i* const row, + const __m128i* const row_sq, const uint32_t s, + uint16_t* const dst) { + static_assert(offset == 0 || offset == 1, ""); + // Number of elements in the box being summed. + constexpr uint32_t n = size * size; + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + __m128i sum, sum_sq; + if (size == 3) { + __m128i temp32[2]; + if (offset == 0) { + sum = Sum3Horizontal(Sum3WLo_16(row)); + sum_sq = Sum3Horizontal_32(Sum3W(row_sq, temp32)); + } else { + sum = Sum3HorizontalOffset1(Sum3WLo_16(row)); + sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq, temp32)); + } + } + if (size == 5) { + __m128i temp[2]; + sum = Sum5Horizontal(Sum5WLo_16(row)); + sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq, temp)); + } + const __m128i sum_32 = _mm_unpacklo_epi16(sum, _mm_setzero_si128()); + const __m128i z0 = CalculateSgrMA2<n>(sum_sq, sum_32, s); + const __m128i z1 = _mm_packus_epi32(z0, z0); + const __m128i z = _mm_min_epu16(z1, _mm_set1_epi16(255)); + __m128i sgr_ma2 = _mm_setzero_si128(); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<0>(z)], 4); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<1>(z)], 5); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<2>(z)], 6); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<3>(z)], 7); + const __m128i b2 = CalculateIntermediate4(sgr_ma2, sum_32, one_over_n); + const __m128i sgr_ma2_b2 = _mm_unpacklo_epi64(sgr_ma2, b2); + StoreAligned16(dst, sgr_ma2_b2); +} + +template <int size, int offset> +inline void BoxFilterPreProcess8(const __m128i* const row, + const __m128i row_sq[][2], const uint32_t s, + __m128i* const sgr_ma2, __m128i* const b2, + uint16_t* const dst) { + // Number of elements in the box being summed. 
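  // (Editor's gloss, not part of the patch.) With kSgrProjReciprocalBits == 12,
  // the constexprs just below evaluate to one_over_n == (4096 + 12) / 25 == 164
  // for the 5x5 pass and (4096 + 4) / 9 == 455 for the 3x3 pass, the constants
  // quoted in the CalculateIntermediate4/8 comments above.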
+ constexpr uint32_t n = size * size; + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + __m128i sum, sum_sq[2]; + if (size == 3) { + __m128i temp16[2], temp32[3]; + if (offset == 0) { + sum = Sum3Horizontal_16(Sum3W_16x2(row, temp16)); + Sum3Horizontal_32x2(Sum3W(row_sq, temp32), sum_sq); + } else /* if (offset == 1) */ { + sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row, temp16)); + Sum3HorizontalOffset1_32x2(Sum3W(row_sq, temp32), sum_sq); + } + } + if (size == 5) { + __m128i temp16[2], temp32[3]; + sum = Sum5Horizontal_16(Sum5W_16D(row, temp16)); + Sum5Horizontal_32x2(Sum5W_32x3(row_sq, temp32), sum_sq); + } + const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128()); + const __m128i z0 = CalculateSgrMA2<n>(sum_sq[0], sum_lo, s); + const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128()); + const __m128i z1 = CalculateSgrMA2<n>(sum_sq[1], sum_hi, s); + const __m128i z01 = _mm_packus_epi32(z0, z1); + const __m128i z = _mm_min_epu16(z01, _mm_set1_epi16(255)); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<0>(z)], 8); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<1>(z)], 9); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<2>(z)], 10); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<3>(z)], 11); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<4>(z)], 12); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<5>(z)], 13); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<6>(z)], 14); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<7>(z)], 15); + *b2 = CalculateIntermediate8(*sgr_ma2, sum, one_over_n); + const __m128i sgr_ma2_b2 = _mm_unpackhi_epi64(*sgr_ma2, *b2); + StoreAligned16(dst, sgr_ma2_b2); +} + +inline void Prepare3_8(const __m128i a, __m128i* const left, + __m128i* const middle, __m128i* const right) { + *left = _mm_srli_si128(a, 4); + *middle = _mm_srli_si128(a, 5); + *right = _mm_srli_si128(a, 6); +} + +inline void Prepare3_16(const __m128i a[2], __m128i* const left, + __m128i* const middle, __m128i* const right) { + *left = _mm_alignr_epi8(a[1], a[0], 8); + *middle = _mm_alignr_epi8(a[1], a[0], 10); + *right = _mm_alignr_epi8(a[1], a[0], 12); +} + +inline __m128i Sum343(const __m128i a) { + __m128i left, middle, right; + Prepare3_8(a, &left, &middle, &right); + const auto sum = Sum3W_16(left, middle, right); + const auto sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, middle); +} + +inline void Sum343_444(const __m128i a, __m128i* const sum343, + __m128i* const sum444) { + __m128i left, middle, right; + Prepare3_8(a, &left, &middle, &right); + const auto sum = Sum3W_16(left, middle, right); + const auto sum3 = Sum3_16(sum, sum, sum); + *sum343 = VaddwLo8(sum3, middle); + *sum444 = _mm_slli_epi16(sum, 2); +} + +inline __m128i* Sum343W(const __m128i a[2], __m128i d[2]) { + __m128i left, middle, right; + Prepare3_16(a, &left, &middle, &right); + d[0] = Sum3WLo_32(left, middle, right); + d[1] = Sum3WHi_32(left, middle, right); + d[0] = Sum3_32(d[0], d[0], d[0]); + d[1] = Sum3_32(d[1], d[1], d[1]); + d[0] = VaddwLo16(d[0], middle); + d[1] = VaddwHi16(d[1], middle); + return d; +} + +inline void Sum343_444W(const __m128i a[2], __m128i sum343[2], + __m128i sum444[2]) { + __m128i left, middle, right; + Prepare3_16(a, &left, &middle, &right); + sum444[0] = Sum3WLo_32(left, middle, right); + sum444[1] = Sum3WHi_32(left, middle, right); + sum343[0] = Sum3_32(sum444[0], sum444[0], sum444[0]); + sum343[1] = 
Sum3_32(sum444[1], sum444[1], sum444[1]); + sum343[0] = VaddwLo16(sum343[0], middle); + sum343[1] = VaddwHi16(sum343[1], middle); + sum444[0] = _mm_slli_epi32(sum444[0], 2); + sum444[1] = _mm_slli_epi32(sum444[1], 2); +} + +inline __m128i Sum565(const __m128i a) { + __m128i left, middle, right; + Prepare3_8(a, &left, &middle, &right); + const auto sum = Sum3W_16(left, middle, right); + const auto sum4 = _mm_slli_epi16(sum, 2); + const auto sum5 = _mm_add_epi16(sum4, sum); + return VaddwLo8(sum5, middle); +} + +inline __m128i Sum565W(const __m128i a) { + const auto left = a; + const auto middle = _mm_srli_si128(a, 2); + const auto right = _mm_srli_si128(a, 4); + const auto sum = Sum3WLo_32(left, middle, right); + const auto sum4 = _mm_slli_epi32(sum, 2); + const auto sum5 = _mm_add_epi32(sum4, sum); + return VaddwLo16(sum5, middle); +} + +// RightShiftWithRounding( +// (a * src_ptr[x] + b), kSgrProjSgrBits + shift - kSgrProjRestoreBits); +template <int shift> +inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i a, + const __m128i b[2]) { + const __m128i src_u16 = _mm_unpacklo_epi8(src, _mm_setzero_si128()); + // a: 256 * 32 = 8192 (14 bits) + // b: 65088 * 32 = 2082816 (21 bits) + const __m128i axsrc_lo = VmullLo16(a, src_u16); + const __m128i axsrc_hi = VmullHi16(a, src_u16); + // v: 8192 * 255 + 2082816 = 4171876 (22 bits) + const __m128i v_lo = _mm_add_epi32(axsrc_lo, b[0]); + const __m128i v_hi = _mm_add_epi32(axsrc_hi, b[1]); + + // kSgrProjSgrBits = 8 + // kSgrProjRestoreBits = 4 + // shift = 4 or 5 + // v >> 8 or 9 + // 22 bits >> 8 = 14 bits + const __m128i dst_lo = + VrshrN32(v_lo, kSgrProjSgrBits + shift - kSgrProjRestoreBits); + const __m128i dst_hi = + VrshrN32(v_hi, kSgrProjSgrBits + shift - kSgrProjRestoreBits); + return _mm_packus_epi32(dst_lo, dst_hi); // 14 bits +} + +inline __m128i BoxFilterPass1(const __m128i src_u8, const __m128i a2, + const __m128i b2[2], __m128i sum565_a[2], + __m128i sum565_b[2][2]) { + __m128i b_v[2]; + sum565_a[1] = Sum565(a2); + sum565_a[1] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[1]); + sum565_b[1][0] = Sum565W(_mm_alignr_epi8(b2[1], b2[0], 8)); + sum565_b[1][1] = Sum565W(b2[1]); + + __m128i a_v = _mm_add_epi16(sum565_a[0], sum565_a[1]); + b_v[0] = _mm_add_epi32(sum565_b[0][0], sum565_b[1][0]); + b_v[1] = _mm_add_epi32(sum565_b[0][1], sum565_b[1][1]); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits +} + +inline __m128i BoxFilterPass2(const __m128i src_u8, const __m128i a2, + const __m128i b2[2], __m128i sum343_a[4], + __m128i sum444_a[3], __m128i sum343_b[4][2], + __m128i sum444_b[3][2]) { + __m128i b_v[2]; + Sum343_444(a2, &sum343_a[2], &sum444_a[1]); + sum343_a[2] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[2]); + sum444_a[1] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[1]); + __m128i a_v = Sum3_16(sum343_a[0], sum444_a[0], sum343_a[2]); + Sum343_444W(b2, sum343_b[2], sum444_b[1]); + b_v[0] = Sum3_32(sum343_b[0][0], sum444_b[0][0], sum343_b[2][0]); + b_v[1] = Sum3_32(sum343_b[0][1], sum444_b[0][1], sum343_b[2][1]); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits +} + +inline void SelfGuidedDoubleMultiplier( + const __m128i src, const __m128i box_filter_process_output[2], + const __m128i w0, const __m128i w1, const __m128i w2, uint8_t* const dst) { + // |wN| values are signed. |src| values can be treated as int16_t. 
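  // (Editor's gloss, not part of the patch.) Scalar reference for the vector
  // code below, written with this function's parameter names:
  //   u = src << kSgrProjRestoreBits;
  //   v = w1 * u + w0 * box_filter_process_output[0]
  //             + w2 * box_filter_process_output[1];
  //   dst = Clip3(RightShiftWithRounding(
  //             v, kSgrProjRestoreBits + kSgrProjPrecisionBits), 0, 255);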
+ const __m128i u = VshllN8(src, kSgrProjRestoreBits); + __m128i v_lo = VmulwLo16(w1, u); + v_lo = VmlawLo16(v_lo, w0, box_filter_process_output[0]); + v_lo = VmlawLo16(v_lo, w2, box_filter_process_output[1]); + __m128i v_hi = VmulwHi16(w1, u); + v_hi = VmlawHi16(v_hi, w0, box_filter_process_output[0]); + v_hi = VmlawHi16(v_hi, w2, box_filter_process_output[1]); + // |s| is saturated to uint8_t. + const __m128i s_lo = + VrshrNS32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s_hi = + VrshrNS32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s = _mm_packs_epi32(s_lo, s_hi); + StoreLo8(dst, _mm_packus_epi16(s, s)); +} + +inline void SelfGuidedSingleMultiplier(const __m128i src, + const __m128i box_filter_process_output, + const int16_t w0, const int16_t w1, + uint8_t* const dst) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + const __m128i u = VshllN8(src, kSgrProjRestoreBits); + // u * w1 + u * wN == u * (w1 + wN) + __m128i v_lo = VmullNLo8(u, w1); + v_lo = VmlalNLo16(v_lo, box_filter_process_output, w0); + __m128i v_hi = VmullNHi8(u, w1); + v_hi = VmlalNHi16(v_hi, box_filter_process_output, w0); + const __m128i s_lo = + VrshrNS32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s_hi = + VrshrNS32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s = _mm_packs_epi32(s_lo, s_hi); + StoreLo8(dst, _mm_packus_epi16(s, s)); +} + +inline void BoxFilterProcess(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint16_t s[2], uint16_t* const temp, + uint8_t* const dst, const ptrdiff_t dst_stride) { + // We have combined PreProcess and Process for the first pass by storing + // intermediate values in the |a2| region. The values stored are one vertical + // column of interleaved |a2| and |b2| values and consume 8 * |height| values. + // This is |height| and not |height| * 2 because PreProcess only generates + // output for every other row. When processing the next column we write the + // new scratch values right after reading the previously saved ones. + + // The PreProcess phase calculates a 5x5 box sum for every other row + // + // PreProcess and Process have been combined into the same step. We need 12 + // input values to generate 8 output values for PreProcess: + // 0 1 2 3 4 5 6 7 8 9 10 11 + // 2 = 0 + 1 + 2 + 3 + 4 + // 3 = 1 + 2 + 3 + 4 + 5 + // 4 = 2 + 3 + 4 + 5 + 6 + // 5 = 3 + 4 + 5 + 6 + 7 + // 6 = 4 + 5 + 6 + 7 + 8 + // 7 = 5 + 6 + 7 + 8 + 9 + // 8 = 6 + 7 + 8 + 9 + 10 + // 9 = 7 + 8 + 9 + 10 + 11 + // + // and then we need 10 input values to generate 8 output values for Process: + // 0 1 2 3 4 5 6 7 8 9 + // 1 = 0 + 1 + 2 + // 2 = 1 + 2 + 3 + // 3 = 2 + 3 + 4 + // 4 = 3 + 4 + 5 + // 5 = 4 + 5 + 6 + // 6 = 5 + 6 + 7 + // 7 = 6 + 7 + 8 + // 8 = 7 + 8 + 9 + // + // To avoid re-calculating PreProcess values over and over again we will do a + // single column of 8 output values and store the second half of them + // interleaved in |temp|. The first half is not stored, since it is used + // immediately and becomes useless for the next column. Next we will start the + // second column. When 2 rows have been calculated we can calculate Process + // and output the results. + + // Calculate and store a single column. Scope so we can re-use the variable + // names for the next step. 
+ uint16_t* ab_ptr = temp; + + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. + { + const uint8_t* column = src_pre_process; + __m128i row[5], row_sq[5]; + row[0] = row[1] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[2] = LoadLo8Msan(column, 2 - width); + + row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]); + row_sq[2] = VmullLo8(row[2], row[2]); + + int y = (height + 2) >> 1; do { - const __m128i v_a = - HorizontalAddVerticalSumsRadius1(&vertical_sum_of_squares[x - 1]); - const __m128i v_b = - HorizontalAddVerticalSumsRadius1(&vertical_sums[x - 1]); - // ----------------------- - // calc p, z, a2 - // ----------------------- - const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0); - const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0); - const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0); - const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b); - const __m128i v_axn = _mm_mullo_epi32(v_a, v_n); - const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd); - const __m128i v_z = _mm_min_epi32( - v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s), - kSgrProjScaleBits)); - const __m128i v_a2 = _mm_set_epi32(kXByXPlus1[_mm_extract_epi32(v_z, 3)], - kXByXPlus1[_mm_extract_epi32(v_z, 2)], - kXByXPlus1[_mm_extract_epi32(v_z, 1)], - kXByXPlus1[_mm_extract_epi32(v_z, 0)]); - // ----------------------- - // calc b2 and store - // ----------------------- - const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2); - const __m128i v_b2 = - _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n)); - StoreUnaligned16(&intermediate_result[0][x], v_a2); - StoreUnaligned16( - &intermediate_result[1][x], - RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits)); - x += 4; - } while (x <= width); - intermediate_result[0] += array_stride; - intermediate_result[1] += array_stride; - } while (++y <= height); -} - -void BoxFilterPreProcessRadius2_SSE4_1( - const uint8_t* const src, ptrdiff_t stride, int width, int height, - uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride, - uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) { - assert(s != 0); - const uint32_t n = 25; - const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const __m128i v_one_over_n = - _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0); - const __m128i v_sgrbits = - _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0); - - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. 
- int y = -1; + column += src_stride; + row[3] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[4] = LoadLo8Msan(column, 2 - width); + + row_sq[3] = VmullLo8(row[3], row[3]); + row_sq[4] = VmullLo8(row[4], row[4]); + + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + ab_ptr += 24; + } while (--y != 0); + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = LoadLo8Msan(column, 2 - width); + row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + } + } + + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + const __m128i w0_v = _mm_set1_epi32(w0); + const __m128i w1_v = _mm_set1_epi32(w1); + const __m128i w2_v = _mm_set1_epi32(w2); + int x = 0; do { - // Calculate the box vertical sums for each x position. - const uint8_t* top_left = &src[(y - 2) * stride - 3]; - int vsx = -3; - do { - const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left)); - const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride)); - const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2)); - const __m128i v_box3 = _mm_cvtepu8_epi32(Load4(top_left + stride * 3)); - const __m128i v_box4 = _mm_cvtepu8_epi32(Load4(top_left + stride * 4)); - const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0); - const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1); - const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2); - const __m128i v_sqr3 = _mm_mullo_epi32(v_box3, v_box3); - const __m128i v_sqr4 = _mm_mullo_epi32(v_box4, v_box4); - const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1); - const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2); - const __m128i v_a0123 = _mm_add_epi32(v_a012, v_sqr3); - const __m128i v_a01234 = _mm_add_epi32(v_a0123, v_sqr4); - const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1); - const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2); - const __m128i v_b0123 = _mm_add_epi32(v_b012, v_box3); - const __m128i v_b01234 = _mm_add_epi32(v_b0123, v_box4); - StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a01234); - StoreUnaligned16(&vertical_sums[vsx], v_b01234); - top_left += 4; - vsx += 4; - } while (vsx <= width + 2); - - int x = -1; + // |src_pre_process| is X but we already processed the first column of 4 + // values so we want to start at Y and increment from there. + // X s s s Y s s + // s s s s s s s + // s s i i i i i + // s s i o o o o + // s s i o o o o + + // Seed the loop with one line of output. Then, inside the loop, for each + // iteration we can output one even row and one odd row and carry the new + // line to the next iteration. In the diagram below 'i' values are + // intermediary values from the first step and '-' values are empty. 
+ // iiii + // ---- > even row + // iiii - odd row + // ---- > even row + // iiii + __m128i a2[2], b2[2][2], sum565_a[2], sum343_a[4], sum444_a[3]; + __m128i sum565_b[2][2], sum343_b[4][2], sum444_b[3][2]; + ab_ptr = temp; + a2[0] = b2[0][0] = LoadAligned16(ab_ptr); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8); + + const uint8_t* column = src_pre_process + x + 4; + __m128i row[5], row_sq[5][2]; + row[0] = row[1] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]); + row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]); + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1], + ab_ptr + 8); + + // Pass 1 Process. These are the only values we need to propagate between + // rows. + sum565_a[0] = Sum565(a2[0]); + sum565_a[0] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[0]); + sum565_b[0][0] = Sum565W(_mm_alignr_epi8(b2[0][1], b2[0][0], 8)); + sum565_b[0][1] = Sum565W(b2[0][1]); + + sum343_a[0] = Sum343(a2[1]); + sum343_a[0] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[0]); + Sum343W(b2[1], sum343_b[0]); + + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 16); + + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1], &b2[1][1], + ab_ptr + 16); + + Sum343_444(a2[1], &sum343_a[1], &sum444_a[0]); + sum343_a[1] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[1]); + sum444_a[0] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[0]); + Sum343_444W(b2[1], sum343_b[1], sum444_b[0]); + + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + + // Calculate one output line. Add in the line from the previous pass and + // output one even row. Sum the new line and output the odd row. Carry the + // new row into the next pass. 
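    // (Editor's gloss, not part of the patch.) Each iteration below advances
    // |ab_ptr| by 24 uint16_t values: one 8-value a2/b2 block from the 5x5 pass
    // plus two from the 3x3 pass (offsets +0, +8 and +16), matching the three
    // per-row-pair stores made by the first-column loop above.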
+ for (int y = height >> 1; y != 0; --y) { + ab_ptr += 24; + a2[0] = b2[0][0] = LoadAligned16(ab_ptr); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1], + ab_ptr + 8); + + __m128i p[2]; + const __m128i src0 = LoadLo8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + const __m128i src1 = LoadLo8(src_ptr); + p[0] = CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 16); + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1], &b2[1][1], + ab_ptr + 16); + p[1] = BoxFilterPass2(src1, a2[1], b2[1], sum343_a + 1, sum444_a + 1, + sum343_b + 1, sum444_b + 1); + SelfGuidedDoubleMultiplier(src1, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + sum565_a[0] = sum565_a[1]; + sum565_b[0][0] = sum565_b[1][0], sum565_b[0][1] = sum565_b[1][1]; + sum343_a[0] = sum343_a[2]; + sum343_a[1] = sum343_a[3]; + sum444_a[0] = sum444_a[2]; + sum343_b[0][0] = sum343_b[2][0], sum343_b[0][1] = sum343_b[2][1]; + sum343_b[1][0] = sum343_b[3][0], sum343_b[1][1] = sum343_b[3][1]; + sum444_b[0][0] = sum444_b[2][0], sum444_b[0][1] = sum444_b[2][1]; + } + if ((height & 1) != 0) { + ab_ptr += 24; + a2[0] = b2[0][0] = LoadAligned16(ab_ptr); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1], + ab_ptr + 8); + + __m128i p[2]; + const __m128i src0 = LoadLo8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + } + x += 8; + } while (x < width); +} + +inline void BoxFilterProcessPass1(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { + // We have combined PreProcess and Process for the first pass by storing + // intermediate values in the |a2| region. 
The values stored are one vertical + // column of interleaved |a2| and |b2| values and consume 8 * |height| values. + // This is |height| and not |height| * 2 because PreProcess only generates + // output for every other row. When processing the next column we write the + // new scratch values right after reading the previously saved ones. + + // The PreProcess phase calculates a 5x5 box sum for every other row + // + // PreProcess and Process have been combined into the same step. We need 12 + // input values to generate 8 output values for PreProcess: + // 0 1 2 3 4 5 6 7 8 9 10 11 + // 2 = 0 + 1 + 2 + 3 + 4 + // 3 = 1 + 2 + 3 + 4 + 5 + // 4 = 2 + 3 + 4 + 5 + 6 + // 5 = 3 + 4 + 5 + 6 + 7 + // 6 = 4 + 5 + 6 + 7 + 8 + // 7 = 5 + 6 + 7 + 8 + 9 + // 8 = 6 + 7 + 8 + 9 + 10 + // 9 = 7 + 8 + 9 + 10 + 11 + // + // and then we need 10 input values to generate 8 output values for Process: + // 0 1 2 3 4 5 6 7 8 9 + // 1 = 0 + 1 + 2 + // 2 = 1 + 2 + 3 + // 3 = 2 + 3 + 4 + // 4 = 3 + 4 + 5 + // 5 = 4 + 5 + 6 + // 6 = 5 + 6 + 7 + // 7 = 6 + 7 + 8 + // 8 = 7 + 8 + 9 + // + // To avoid re-calculating PreProcess values over and over again we will do a + // single column of 8 output values and store the second half of them + // interleaved in |temp|. The first half is not stored, since it is used + // immediately and becomes useless for the next column. Next we will start the + // second column. When 2 rows have been calculated we can calculate Process + // and output the results. + + // Calculate and store a single column. Scope so we can re-use the variable + // names for the next step. + uint16_t* ab_ptr = temp; + + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. 
+ { + const uint8_t* column = src_pre_process; + __m128i row[5], row_sq[5]; + row[0] = row[1] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[2] = LoadLo8Msan(column, 2 - width); + + row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]); + row_sq[2] = VmullLo8(row[2], row[2]); + + int y = (height + 2) >> 1; do { - const __m128i v_a = - HorizontalAddVerticalSumsRadius2(&vertical_sum_of_squares[x - 2]); - const __m128i v_b = - HorizontalAddVerticalSumsRadius2(&vertical_sums[x - 2]); - // ----------------------- - // calc p, z, a2 - // ----------------------- - const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0); - const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0); - const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0); - const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b); - const __m128i v_axn = _mm_mullo_epi32(v_a, v_n); - const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd); - const __m128i v_z = _mm_min_epi32( - v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s), - kSgrProjScaleBits)); - const __m128i v_a2 = _mm_set_epi32(kXByXPlus1[_mm_extract_epi32(v_z, 3)], - kXByXPlus1[_mm_extract_epi32(v_z, 2)], - kXByXPlus1[_mm_extract_epi32(v_z, 1)], - kXByXPlus1[_mm_extract_epi32(v_z, 0)]); - // ----------------------- - // calc b2 and store - // ----------------------- - const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2); - const __m128i v_b2 = - _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n)); - StoreUnaligned16(&intermediate_result[0][x], v_a2); - StoreUnaligned16( - &intermediate_result[1][x], - RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits)); - x += 4; - } while (x <= width); - intermediate_result[0] += 2 * array_stride; - intermediate_result[1] += 2 * array_stride; - y += 2; - } while (y <= height); -} - -void BoxFilterPreProcess_SSE4_1(const RestorationUnitInfo& restoration_info, - const uint8_t* const src, ptrdiff_t stride, - int width, int height, int pass, - RestorationBuffer* const buffer) { - uint32_t vertical_sums_buf[kRestorationProcessingUnitSize + - 2 * kRestorationBorder + kRestorationPadding]; - uint32_t vertical_sum_of_squares_buf[kRestorationProcessingUnitSize + - 2 * kRestorationBorder + - kRestorationPadding]; - uint32_t* vertical_sums = &vertical_sums_buf[4]; - uint32_t* vertical_sum_of_squares = &vertical_sum_of_squares_buf[4]; - const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride; - // The size of the intermediate result buffer is the size of the filter area - // plus horizontal (3) and vertical (3) padding. The processing start point - // is the filter area start point -1 row and -1 column. Therefore we need to - // set offset and use the intermediate_result as the start point for - // processing. 
- const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * array_stride + kRestorationBorder; - uint32_t* intermediate_result[2] = { - buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset - - array_stride, - buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset - - array_stride}; - const int sgr_proj_index = restoration_info.sgr_proj_info.index; - if (pass == 0) { - assert(kSgrProjParams[sgr_proj_index][0] == 2); - BoxFilterPreProcessRadius2_SSE4_1(src, stride, width, height, - kSgrScaleParameter[sgr_proj_index][0], - intermediate_result, array_stride, - vertical_sums, vertical_sum_of_squares); - } else { - assert(kSgrProjParams[sgr_proj_index][2] == 1); - BoxFilterPreProcessRadius1_SSE4_1(src, stride, width, height, - kSgrScaleParameter[sgr_proj_index][1], - intermediate_result, array_stride, - vertical_sums, vertical_sum_of_squares); + column += src_stride; + row[3] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[4] = LoadLo8Msan(column, 2 - width); + + row_sq[3] = VmullLo8(row[3], row[3]); + row_sq[4] = VmullLo8(row[4], row[4]); + + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + ab_ptr += 8; + } while (--y != 0); + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = LoadLo8Msan(column, 2 - width); + row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); + } } -} -inline __m128i Sum565Row(const __m128i v_DBCA, const __m128i v_XXFE) { - __m128i v_sum = v_DBCA; - const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4); - v_sum = _mm_add_epi32(v_sum, v_EDCB); - const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8); - v_sum = _mm_add_epi32(v_sum, v_FEDC); - // D C B A x4 - // + E D C B x4 - // + F E D C x4 - v_sum = _mm_slli_epi32(v_sum, 2); - // + D C B A - v_sum = _mm_add_epi32(v_sum, v_DBCA); // 5 - // + E D C B x2 - v_sum = _mm_add_epi32(v_sum, _mm_slli_epi32(v_EDCB, 1)); // 6 - // + F E D C - return _mm_add_epi32(v_sum, v_FEDC); // 5 -} - -inline __m128i Process3x3Block_565_Odd(const uint32_t* src, ptrdiff_t stride) { - // 0 0 0 - // 5 6 5 - // 0 0 0 - const uint32_t* top_left = src - 1; - const __m128i v_src1_lo = LoadUnaligned16(top_left + stride); - const __m128i v_src1_hi = LoadLo8(top_left + stride + 4); - return Sum565Row(v_src1_lo, v_src1_hi); -} - -inline __m128i Process3x3Block_565_Even(const uint32_t* src, ptrdiff_t stride) { - // 5 6 5 - // 0 0 0 - // 5 6 5 - const uint32_t* top_left = src - 1; - const __m128i v_src0_lo = LoadUnaligned16(top_left); - const __m128i v_src0_hi = LoadLo8(top_left + 4); - const __m128i v_src2_lo = LoadUnaligned16(top_left + stride * 2); - const __m128i v_src2_hi = LoadLo8(top_left + stride * 2 + 4); - const __m128i v_a0 = Sum565Row(v_src0_lo, v_src0_hi); - const __m128i v_a2 = Sum565Row(v_src2_lo, v_src2_hi); - return _mm_add_epi32(v_a0, v_a2); -} - -inline __m128i Sum343Row(const __m128i v_DBCA, const __m128i v_XXFE) { - __m128i v_sum = v_DBCA; - const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4); - v_sum = _mm_add_epi32(v_sum, v_EDCB); - const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8); - v_sum = _mm_add_epi32(v_sum, v_FEDC); - // D C B A x4 - // + E D C B x4 - // + F E D C x4 - v_sum = _mm_slli_epi32(v_sum, 2); // 4 - // - D C B A - v_sum = _mm_sub_epi32(v_sum, v_DBCA); // 3 - // - F E D C - return _mm_sub_epi32(v_sum, v_FEDC); // 3 -} - -inline __m128i 
Sum444Row(const __m128i v_DBCA, const __m128i v_XXFE) { - __m128i v_sum = v_DBCA; - const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4); - v_sum = _mm_add_epi32(v_sum, v_EDCB); - const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8); - v_sum = _mm_add_epi32(v_sum, v_FEDC); - // D C B A x4 - // + E D C B x4 - // + F E D C x4 - return _mm_slli_epi32(v_sum, 2); // 4 -} - -inline __m128i Process3x3Block_343(const uint32_t* src, ptrdiff_t stride) { - const uint32_t* top_left = src - 1; - const __m128i v_ir0_lo = LoadUnaligned16(top_left); - const __m128i v_ir0_hi = LoadLo8(top_left + 4); - const __m128i v_ir1_lo = LoadUnaligned16(top_left + stride); - const __m128i v_ir1_hi = LoadLo8(top_left + stride + 4); - const __m128i v_ir2_lo = LoadUnaligned16(top_left + stride * 2); - const __m128i v_ir2_hi = LoadLo8(top_left + stride * 2 + 4); - const __m128i v_a0 = Sum343Row(v_ir0_lo, v_ir0_hi); - const __m128i v_a1 = Sum444Row(v_ir1_lo, v_ir1_hi); - const __m128i v_a2 = Sum343Row(v_ir2_lo, v_ir2_hi); - return _mm_add_epi32(v_a0, _mm_add_epi32(v_a1, v_a2)); -} - -void BoxFilterProcess_SSE4_1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, ptrdiff_t stride, int width, - int height, RestorationBuffer* const buffer) { - const int sgr_proj_index = restoration_info.sgr_proj_info.index; - for (int pass = 0; pass < 2; ++pass) { - const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2]; - const uint8_t* src_ptr = src; - if (radius == 0) continue; - - BoxFilterPreProcess_SSE4_1(restoration_info, src_ptr, stride, width, height, - pass, buffer); - - int* filtered_output = buffer->box_filter_process_output[pass]; - const ptrdiff_t filtered_output_stride = - buffer->box_filter_process_output_stride; - const ptrdiff_t intermediate_stride = - buffer->box_filter_process_intermediate_stride; - // Set intermediate buffer start point to the actual start point of - // filtering. - const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * intermediate_stride + kRestorationBorder; - - if (pass == 0) { - int y = 0; - do { - const int shift = ((y & 1) != 0) ? 
4 : 5; - uint32_t* const array_start[2] = { - buffer->box_filter_process_intermediate[0] + - intermediate_buffer_offset + y * intermediate_stride, - buffer->box_filter_process_intermediate[1] + - intermediate_buffer_offset + y * intermediate_stride}; - uint32_t* intermediate_result2[2] = { - array_start[0] - intermediate_stride, - array_start[1] - intermediate_stride}; - if ((y & 1) == 0) { // even row - int x = 0; - do { - // 5 6 5 - // 0 0 0 - // 5 6 5 - const __m128i v_A = Process3x3Block_565_Even( - &intermediate_result2[0][x], intermediate_stride); - const __m128i v_B = Process3x3Block_565_Even( - &intermediate_result2[1][x], intermediate_stride); - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x)); - const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src); - const __m128i v_v = _mm_add_epi32(v_v0, v_B); - const __m128i v_filtered = RightShiftWithRounding_U32( - v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - - StoreUnaligned16(&filtered_output[x], v_filtered); - x += 4; - } while (x < width); - } else { - int x = 0; - do { - // 0 0 0 - // 5 6 5 - // 0 0 0 - const __m128i v_A = Process3x3Block_565_Odd( - &intermediate_result2[0][x], intermediate_stride); - const __m128i v_B = Process3x3Block_565_Odd( - &intermediate_result2[1][x], intermediate_stride); - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x)); - const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src); - const __m128i v_v = _mm_add_epi32(v_v0, v_B); - const __m128i v_filtered = RightShiftWithRounding_U32( - v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - - StoreUnaligned16(&filtered_output[x], v_filtered); - x += 4; - } while (x < width); - } - src_ptr += stride; - filtered_output += filtered_output_stride; - } while (++y < height); - } else { - int y = 0; - do { - const int shift = 5; - uint32_t* const array_start[2] = { - buffer->box_filter_process_intermediate[0] + - intermediate_buffer_offset + y * intermediate_stride, - buffer->box_filter_process_intermediate[1] + - intermediate_buffer_offset + y * intermediate_stride}; - uint32_t* intermediate_result2[2] = { - array_start[0] - intermediate_stride, - array_start[1] - intermediate_stride}; - int x = 0; - do { - const __m128i v_A = Process3x3Block_343(&intermediate_result2[0][x], - intermediate_stride); - const __m128i v_B = Process3x3Block_343(&intermediate_result2[1][x], - intermediate_stride); - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x)); - const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src); - const __m128i v_v = _mm_add_epi32(v_v0, v_B); - const __m128i v_filtered = RightShiftWithRounding_U32( - v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - - StoreUnaligned16(&filtered_output[x], v_filtered); - x += 4; - } while (x < width); - src_ptr += stride; - filtered_output += filtered_output_stride; - } while (++y < height); + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0; + int x = 0; + do { + // |src_pre_process| is X but we already processed the first column of 4 + // values so we want to start at Y and increment from there. + // X s s s Y s s + // s s s s s s s + // s s i i i i i + // s s i o o o o + // s s i o o o o + + // Seed the loop with one line of output. Then, inside the loop, for each + // iteration we can output one even row and one odd row and carry the new + // line to the next iteration. In the diagram below 'i' values are + // intermediary values from the first step and '-' values are empty. 
+ // iiii + // ---- > even row + // iiii - odd row + // ---- > even row + // iiii + __m128i a2[2], b2[2], sum565_a[2], sum565_b[2][2]; + ab_ptr = temp; + a2[0] = b2[0] = LoadAligned16(ab_ptr); + + const uint8_t* column = src_pre_process + x + 4; + __m128i row[5], row_sq[5][2]; + row[0] = row[1] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]); + row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]); + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr); + + // Pass 1 Process. These are the only values we need to propagate between + // rows. + sum565_a[0] = Sum565(a2[0]); + sum565_a[0] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[0]); + sum565_b[0][0] = Sum565W(_mm_alignr_epi8(b2[1], b2[0], 8)); + sum565_b[0][1] = Sum565W(b2[1]); + + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + + // Calculate one output line. Add in the line from the previous pass and + // output one even row. Sum the new line and output the odd row. Carry the + // new row into the next pass. + for (int y = height >> 1; y != 0; --y) { + ab_ptr += 8; + a2[0] = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr); + + const __m128i src0 = LoadLo8(src_ptr); + const __m128i p0 = BoxFilterPass1(src0, a2[0], b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + const __m128i src1 = LoadLo8(src_ptr); + const __m128i p1 = + CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); + SelfGuidedSingleMultiplier(src1, p1, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + sum565_a[0] = sum565_a[1]; + sum565_b[0][0] = sum565_b[1][0], sum565_b[0][1] = sum565_b[1][1]; + } + if ((height & 1) != 0) { + ab_ptr += 8; + a2[0] = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr); + + const __m128i src0 = LoadLo8(src_ptr); + const __m128i 
p0 = BoxFilterPass1(src0, a2[0], b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); } + x += 8; + } while (x < width); +} + +inline void BoxFilterProcessPass2(const uint8_t* src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { + uint16_t* ab_ptr = temp; + + // Calculate intermediate results, including one-pixel border, for example, if + // unit size is 64x64, we calculate 66x66 pixels. + // Because of the vectors this calculates start in blocks of 4 so we actually + // get 68 values. + const uint8_t* const src_top_left_corner = src - 2 * src_stride - 2; + { + const uint8_t* column = src_top_left_corner; + __m128i row[3], row_sq[3]; + row[0] = LoadLo8Msan(column, 4 - width); + column += src_stride; + row[1] = LoadLo8Msan(column, 4 - width); + row_sq[0] = VmullLo8(row[0], row[0]); + row_sq[1] = VmullLo8(row[1], row[1]); + + int y = height + 2; + do { + column += src_stride; + row[2] = LoadLo8Msan(column, 4 - width); + row_sq[2] = VmullLo8(row[2], row[2]); + + BoxFilterPreProcess4<3, 0>(row, row_sq, s, ab_ptr); + + row[0] = row[1]; + row[1] = row[2]; + + row_sq[0] = row_sq[1]; + row_sq[1] = row_sq[2]; + ab_ptr += 8; + } while (--y != 0); } + + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + int x = 0; + do { + ab_ptr = temp; + + __m128i a2, b2[2], sum343_a[3], sum444_a[2], sum343_b[3][2], sum444_b[2][2]; + a2 = b2[0] = LoadAligned16(ab_ptr); + + const uint8_t* column = src_top_left_corner + x + 4; + __m128i row[3], row_sq[3][2]; + row[0] = LoadUnaligned16Msan(column, x + 16 - width); + column += src_stride; + row[1] = LoadUnaligned16Msan(column, x + 16 - width); + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 16 - width); + + row_sq[0][0] = VmullLo8(row[0], row[0]); + row_sq[0][1] = VmullHi8(row[0], row[0]); + row_sq[1][0] = VmullLo8(row[1], row[1]); + row_sq[1][1] = VmullHi8(row[1], row[1]); + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr); + + sum343_a[0] = Sum343(a2); + sum343_a[0] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[0]); + Sum343W(b2, sum343_b[0]); + + ab_ptr += 8; + a2 = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[1]; + row[1] = row[2]; + + row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1]; + row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1]; + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 16 - width); + + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr); + + Sum343_444(a2, &sum343_a[1], &sum444_a[0]); + sum343_a[1] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[1]); + sum444_a[0] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[0]); + Sum343_444W(b2, sum343_b[1], sum444_b[0]); + + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + int y = height; + do { + ab_ptr += 8; + a2 = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[1]; + row[1] = row[2]; + + row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1]; + row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1]; + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 16 - width); 
+ + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr); + + const __m128i src_u8 = LoadLo8(src_ptr); + const __m128i p = BoxFilterPass2(src_u8, a2, b2, sum343_a, sum444_a, + sum343_b, sum444_b); + SelfGuidedSingleMultiplier(src_u8, p, w0, w1, dst_ptr); + sum343_a[0] = sum343_a[1]; + sum343_a[1] = sum343_a[2]; + sum444_a[0] = sum444_a[1]; + sum343_b[0][0] = sum343_b[1][0], sum343_b[0][1] = sum343_b[1][1]; + sum343_b[1][0] = sum343_b[2][0], sum343_b[1][1] = sum343_b[2][1]; + sum444_b[0][0] = sum444_b[1][0], sum444_b[0][1] = sum444_b[1][1]; + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--y != 0); + x += 8; + } while (x < width); } -void SelfGuidedFilter_SSE4_1(const void* source, void* dest, +// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in +// the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_SSE4_1(const void* const source, void* const dest, const RestorationUnitInfo& restoration_info, - ptrdiff_t source_stride, ptrdiff_t dest_stride, - int width, int height, + const ptrdiff_t source_stride, + const ptrdiff_t dest_stride, const int width, + const int height, RestorationBuffer* const buffer) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 const auto* src = static_cast<const uint8_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - const int w0 = restoration_info.sgr_proj_info.multiplier[0]; - const int w1 = restoration_info.sgr_proj_info.multiplier[1]; - const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; - const int index = restoration_info.sgr_proj_info.index; - const uint8_t r0 = kSgrProjParams[index][0]; - const uint8_t r1 = kSgrProjParams[index][2]; - const ptrdiff_t array_stride = buffer->box_filter_process_output_stride; - int* box_filter_process_output[2] = {buffer->box_filter_process_output[0], - buffer->box_filter_process_output[1]}; - - BoxFilterProcess_SSE4_1(restoration_info, src, source_stride, width, height, - buffer); - - const __m128i v_w0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w0), 0); - const __m128i v_w1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w1), 0); - const __m128i v_w2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w2), 0); - const __m128i v_r0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r0), 0); - const __m128i v_r1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r1), 0); - const __m128i zero = _mm_setzero_si128(); - // Create masks used to select between src and box_filter_process_output. - const __m128i v_r0_mask = _mm_cmpeq_epi32(v_r0, zero); - const __m128i v_r1_mask = _mm_cmpeq_epi32(v_r1, zero); - - int y = 0; - do { - int x = 0; - do { - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src + x)); - const __m128i v_u = _mm_slli_epi32(v_src, kSgrProjRestoreBits); - const __m128i v_v_a = _mm_mullo_epi32(v_w1, v_u); - const __m128i v_bfp_out0 = - LoadUnaligned16(&box_filter_process_output[0][x]); - // Select u or box_filter_process_output[0][x]. - const __m128i v_r0_mult = _mm_blendv_epi8(v_bfp_out0, v_u, v_r0_mask); - const __m128i v_v_b = _mm_mullo_epi32(v_w0, v_r0_mult); - const __m128i v_v_c = _mm_add_epi32(v_v_a, v_v_b); - const __m128i v_bfp_out1 = - LoadUnaligned16(&box_filter_process_output[1][x]); - // Select u or box_filter_process_output[1][x]. 
- const __m128i v_r1_mult = _mm_blendv_epi8(v_bfp_out1, v_u, v_r1_mask); - const __m128i v_v_d = _mm_mullo_epi32(v_w2, v_r1_mult); - const __m128i v_v_e = _mm_add_epi32(v_v_c, v_v_d); - __m128i v_s = RightShiftWithRounding_S32( - v_v_e, kSgrProjRestoreBits + kSgrProjPrecisionBits); - v_s = _mm_packs_epi32(v_s, v_s); - v_s = _mm_packus_epi16(v_s, v_s); - Store4(&dst[x], v_s); - x += 4; - } while (x < width); - - src += source_stride; - dst += dest_stride; - box_filter_process_output[0] += array_stride; - box_filter_process_output[1] += array_stride; - } while (++y < height); + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. + assert(radius_pass_0 != 0); + BoxFilterProcessPass1(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][0], buffer->sgf_buffer, dst, + dest_stride); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][1], buffer->sgf_buffer, dst, + dest_stride); + } else { + BoxFilterProcess(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index], buffer->sgf_buffer, dst, + dest_stride); + } } void Init8bpp() { diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc new file mode 100644 index 00000000000..13f0853b2cb --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc @@ -0,0 +1,397 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_field_projection.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_SSE4_1 + +#include <smmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace dsp { +namespace { + +inline __m128i LoadDivision(const __m128i division_table, + const __m128i reference_offset) { + const __m128i kOne = _mm_set1_epi16(0x0100); + const __m128i t = _mm_add_epi8(reference_offset, reference_offset); + const __m128i tt = _mm_unpacklo_epi8(t, t); + const __m128i idx = _mm_add_epi8(tt, kOne); + return _mm_shuffle_epi8(division_table, idx); +} + +inline __m128i MvProjection(const __m128i mv, const __m128i denominator, + const int numerator) { + const __m128i m0 = _mm_madd_epi16(mv, denominator); + const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator)); + // Add the sign (0 or -1) to round towards zero. 
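The intrinsics below implement the rounding described in the comment above. As a scalar model of one 32-bit lane (the helper name is illustrative, not from the patch), the product is rounded at bit 14, and the sign term gives negative products the same mirrored rounding as positive ones.

    #include <cstdint>

    // Scalar model of one 32-bit lane of MvProjection.
    inline int32_t MvProjectionScalarModel(int16_t mv, int16_t denominator,
                                           int32_t numerator) {
      const int32_t m = mv * denominator * numerator;
      const int32_t sign = m >> 31;  // 0 when m >= 0, -1 when m < 0.
      return (m + sign + (1 << 13)) >> 14;
    }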
+ const __m128i sign = _mm_srai_epi32(m, 31); + const __m128i add_sign = _mm_add_epi32(m, sign); + const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13)); + return _mm_srai_epi32(sum, 14); +} + +inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator, + const int numerator) { + const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128()); + const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128()); + const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128()); + const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128()); + const __m128i s0 = MvProjection(mv0, denorm0, numerator); + const __m128i s1 = MvProjection(mv1, denorm1, numerator); + const __m128i projection = _mm_packs_epi32(s0, s1); + const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp); + const __m128i projection_mv_clamp_negative = + _mm_set1_epi16(-kProjectionMvClamp); + const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp); + return _mm_max_epi16(clamp, projection_mv_clamp_negative); +} + +inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) { + // Add 63 to negative delta so that it shifts towards zero. + const __m128i delta_sign = _mm_srai_epi16(delta, 15); + const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10); + const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63); + const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6); + const __m128i offset1 = _mm_xor_si128(offset0, dst_sign); + return _mm_sub_epi16(offset1, dst_sign); +} + +inline void GetPosition( + const __m128i division_table, const MotionVector* const mv, + const int numerator, const int x8_start, const int x8_end, const int x8, + const __m128i r_offsets, const __m128i source_reference_type8, + const __m128i skip_r, const __m128i y8_floor8, const __m128i y8_ceiling8, + const __m128i d_sign, const int delta, __m128i* const r, + __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) { + const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8); + *r = _mm_shuffle_epi8(r_offsets, source_reference_type8); + const __m128i denorm = LoadDivision(division_table, source_reference_type8); + __m128i projection_mv[2]; + mvs[0] = LoadUnaligned16(mv_int + 0); + mvs[1] = LoadUnaligned16(mv_int + 4); + // Deinterlace x and y components + const __m128i kShuffle = + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle); + const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle); + const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1); + const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1); + // numerator could be 0. + projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator); + projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator); + // Do not update the motion vector if the block position is not valid or + // if position_x8 is outside the current range of x8_start and x8_end. + // Note that position_y8 will always be within the range of y8_start and + // y8_end. + // After subtracting the base, valid projections are within 8-bit. 
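Per 16-bit lane, Project_SSE4_1 above reduces to a divide-by-64 that rounds towards zero, followed by a conditional negation when |dst_sign| is -1. A scalar sketch (illustrative name, not from the patch); the two statements below apply it to the y and x projections and pack the results to 8 bits before the range checks.

    #include <cstdint>

    // |dst_sign| is 0 or -1, mirroring _mm_set1_epi16(dst_sign).
    inline int ProjectScalarModel(int16_t delta, int dst_sign) {
      const int adjusted = delta + ((delta < 0) ? 63 : 0);
      const int offset = adjusted >> 6;       // delta / 64, rounded towards 0.
      return (offset ^ dst_sign) - dst_sign;  // Negate when dst_sign == -1.
    }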
+ const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign); + const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign); + const __m128i positions = _mm_packs_epi16(position_x, position_y); + const __m128i k01234567 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + *position_xy = _mm_add_epi8(positions, k01234567); + const int x8_floor = std::max( + x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8] + const int x8_ceiling = + std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) - + 1; // [-1, 15] + const __m128i x8_floor8 = _mm_set1_epi8(x8_floor); + const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling); + const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8); + const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8); + const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy); + const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy); + const __m128i out = _mm_or_si128(underflow, overflow); + const __m128i skip_low = _mm_or_si128(skip_r, out); + const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8)); + StoreLo8(skip_64, skip); +} + +template <int idx> +inline void Store(const __m128i position, const __m128i reference_offset, + const __m128i mv, int8_t* dst_reference_offset, + MotionVector* dst_mv) { + const ptrdiff_t offset = + static_cast<int16_t>(_mm_extract_epi16(position, idx)); + if ((idx & 3) == 0) { + dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv); + } else { + dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3); + } + dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx); +} + +template <int idx> +inline void CheckStore(const int8_t* skips, const __m128i position, + const __m128i reference_offset, const __m128i mv, + int8_t* dst_reference_offset, MotionVector* dst_mv) { + if (skips[idx] == 0) { + Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv); + } +} + +// 7.9.2. +void MotionFieldProjectionKernel_SSE4_1( + const ReferenceInfo& reference_info, + const int reference_to_current_with_sign, const int dst_sign, + const int y8_start, const int y8_end, const int x8_start, const int x8_end, + TemporalMotionField* const motion_field) { + const ptrdiff_t stride = motion_field->mv.columns(); + // The column range has to be offset by kProjectionMvMaxHorizontalOffset since + // coordinates in that range could end up being position_x8 because of + // projection. 
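As a worked example of the comment above: if the destination tile covers 8x8-block columns [16, 48) in a 60-column motion field, and assuming kProjectionMvMaxHorizontalOffset is 8 (consistent with the [-8, 8] lane ranges noted in GetPosition), the code below scans source columns [8, 56); a motion vector stored at column 9 may legitimately project to position_x8 = 16, which lies inside the destination column range.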
+ const int adjusted_x8_start = + std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0); + const int adjusted_x8_end = std::min( + x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride)); + const int adjusted_x8_end8 = adjusted_x8_end & ~7; + const int leftover = adjusted_x8_end - adjusted_x8_end8; + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; + int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; + MotionVector* dst_mv = motion_field->mv[y8_start]; + const __m128i d_sign = _mm_set1_epi16(dst_sign); + + static_assert(sizeof(int8_t) == sizeof(bool), ""); + static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), ""); + static_assert(sizeof(int32_t) == sizeof(MotionVector), ""); + assert(dst_sign == 0 || dst_sign == -1); + assert(stride == motion_field->reference_offset.columns()); + assert((y8_start & 7) == 0); + assert((adjusted_x8_start & 7) == 0); + // The final position calculation is represented with int16_t. Valid + // position_y8 from its base is at most 7. After considering the horizontal + // offset which is at most |stride - 1|, we have the following assertion, + // which means this optimization works for frame width up to 32K (each + // position is a 8x8 block). + assert(8 * stride <= 32768); + const __m128i skip_reference = LoadLo8(skip_references); + const __m128i r_offsets = LoadLo8(reference_offsets); + const __m128i division_table = LoadUnaligned16(projection_divisions); + + int y8 = y8_start; + do { + const int y8_floor = (y8 & ~7) - y8; // [-7, 0] + const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7] + const __m128i y8_floor8 = _mm_set1_epi8(y8_floor); + const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling); + int x8; + + for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) { + const __m128i source_reference_type8 = + LoadLo8(source_reference_types + x8); + const __m128i skip_r = + _mm_shuffle_epi8(skip_reference, source_reference_type8); + int64_t early_skip; + StoreLo8(&early_skip, skip_r); + // Early termination #1 if all are skips. Chance is typically ~30-40%. + if (early_skip == -1) continue; + int64_t skip_64; + __m128i r, position_xy, mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, x8_start, + x8_end, x8, r_offsets, source_reference_type8, skip_r, + y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64, + mvs); + // Early termination #2 if all are skips. + // Chance is typically ~15-25% after Early termination #1. + if (skip_64 == -1) continue; + const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8)); + const __m128i p_x = _mm_cvtepi8_epi16(position_xy); + const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride)); + const __m128i pos = _mm_add_epi16(p_y_offset, p_x); + const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8)); + if (skip_64 == 0) { + // Store all. Chance is typically ~70-85% after Early termination #2. 
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // Chance is typically ~15-30% after Early termination #2. + // The compiler is smart enough to not create the local buffer skips[]. + int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + } + } + + // The following leftover processing cannot be moved out of the do...while + // loop. Doing so may change the result storing orders of the same position. + if (leftover > 0) { + // Use SIMD only when leftover is at least 4, and there are at least 8 + // elements in a row. + if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) { + // Process the last 8 elements to avoid loading invalid memory. Some + // elements may have been processed in the above loop, which is OK. + const int delta = 8 - leftover; + x8 = adjusted_x8_end - 8; + const __m128i source_reference_type8 = + LoadLo8(source_reference_types + x8); + const __m128i skip_r = + _mm_shuffle_epi8(skip_reference, source_reference_type8); + int64_t early_skip; + StoreLo8(&early_skip, skip_r); + // Early termination #1 if all are skips. + if (early_skip != -1) { + int64_t skip_64; + __m128i r, position_xy, mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, + x8_start, x8_end, x8, r_offsets, source_reference_type8, + skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r, + &position_xy, &skip_64, mvs); + // Early termination #2 if all are skips. + if (skip_64 != -1) { + const __m128i p_y = + _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8)); + const __m128i p_x = _mm_cvtepi8_epi16(position_xy); + const __m128i p_y_offset = + _mm_mullo_epi16(p_y, _mm_set1_epi16(stride)); + const __m128i pos = _mm_add_epi16(p_y_offset, p_x); + const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8)); + // Store up to 7 elements since leftover is at most 7. + if (skip_64 == 0) { + // Store all. + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // The compiler is smart enough to not create the local buffer + // skips[]. 
+ int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + } + } + } + } else { + for (; x8 < adjusted_x8_end; ++x8) { + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; + MotionVector projection_mv; + // reference_to_current_with_sign could be 0. + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], + &projection_mv); + // Do not update the motion vector if the block position is not valid + // or if position_x8 is outside the current range of x8_start and + // x8_end. Note that position_y8 will always be within the range of + // y8_start and y8_end. + const int position_y8 = Project(0, projection_mv.mv[0], dst_sign); + if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue; + const int x8_base = x8 & ~7; + const int x8_floor = + std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset); + const int x8_ceiling = + std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset); + const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign); + if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; + dst_mv[position_y8 * stride + position_x8] = mv[x8]; + dst_reference_offset[position_y8 * stride + position_x8] = + reference_offsets[source_reference_type]; + } + } + } + + source_reference_types += stride; + mv += stride; + dst_reference_offset += stride; + dst_mv += stride; + } while (++y8 < y8_end); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; +} +#endif + +} // namespace + +void MotionFieldProjectionInit_SSE4_1() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionFieldProjectionInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_SSE4_1 diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h new file mode 100644 index 00000000000..7828de5ca39 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h @@ -0,0 +1,37 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::motion_field_projection_kernel. This function is not +// thread-safe. +void MotionFieldProjectionInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_SSE4_1 +#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1 +#endif // LIBGAV1_ENABLE_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_ diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc new file mode 100644 index 00000000000..a4b77da7877 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc @@ -0,0 +1,262 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_vector_search.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_SSE4_1 + +#include <smmintrin.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = { + 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, + 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, + 744, 712, 682, 655, 630, 606, 585, 564, 546, 528}; + +inline __m128i MvProjection(const __m128i mv, const __m128i denominator, + const __m128i numerator) { + const __m128i m0 = _mm_madd_epi16(mv, denominator); + const __m128i m = _mm_mullo_epi32(m0, numerator); + // Add the sign (0 or -1) to round towards zero. 
+ const __m128i sign = _mm_srai_epi32(m, 31); + const __m128i add_sign = _mm_add_epi32(m, sign); + const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13)); + return _mm_srai_epi32(sum, 14); +} + +inline __m128i MvProjectionClip(const __m128i mvs[2], + const __m128i denominators[2], + const __m128i numerator) { + const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator); + const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator); + const __m128i mv = _mm_packs_epi32(s0, s1); + const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp); + const __m128i projection_mv_clamp_negative = + _mm_set1_epi16(-kProjectionMvClamp); + const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp); + return _mm_max_epi16(clamp, projection_mv_clamp_negative); +} + +inline __m128i MvProjectionCompoundClip( + const MotionVector* const temporal_mvs, + const int8_t temporal_reference_offsets[2], + const int reference_offsets[2]) { + const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); + const __m128i temporal_mv = LoadLo8(tmvs); + const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv); + __m128i mvs[2], denominators[2]; + mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0); + mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0); + denominators[0] = _mm_set1_epi32( + kProjectionMvDivisionLookup[temporal_reference_offsets[0]]); + denominators[1] = _mm_set1_epi32( + kProjectionMvDivisionLookup[temporal_reference_offsets[1]]); + const __m128i offsets = LoadLo8(reference_offsets); + const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets); + return MvProjectionClip(mvs, denominators, numerator); +} + +inline __m128i MvProjectionSingleClip( + const MotionVector* const temporal_mvs, + const int8_t* const temporal_reference_offsets, + const int reference_offset) { + const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs); + const __m128i temporal_mv = LoadAligned16(tmvs); + __m128i lookup = _mm_cvtsi32_si128( + kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]); + lookup = _mm_insert_epi32( + lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]], + 1); + lookup = _mm_insert_epi32( + lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]], + 2); + lookup = _mm_insert_epi32( + lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]], + 3); + __m128i mvs[2], denominators[2]; + mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128()); + mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128()); + denominators[0] = _mm_unpacklo_epi32(lookup, lookup); + denominators[1] = _mm_unpackhi_epi32(lookup, lookup); + const __m128i numerator = _mm_set1_epi32(reference_offset); + return MvProjectionClip(mvs, denominators, numerator); +} + +inline void LowPrecision(const __m128i mv, void* const candidate_mvs) { + const __m128i kRoundDownMask = _mm_set1_epi16(~1); + const __m128i sign = _mm_srai_epi16(mv, 15); + const __m128i sub_sign = _mm_sub_epi16(mv, sign); + const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask); + StoreAligned16(candidate_mvs, d); +} + +inline void ForceInteger(const __m128i mv, void* const candidate_mvs) { + const __m128i kRoundDownMask = _mm_set1_epi16(~7); + const __m128i sign = _mm_srai_epi16(mv, 15); + const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3)); + const __m128i mv2 = _mm_sub_epi16(mv1, sign); + const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask); + StoreAligned16(candidate_mvs, mv3); +} + +void 
MvProjectionCompoundLowPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionCompoundClip( + temporal_mvs + i, temporal_reference_offsets + i, offsets); + LowPrecision(mv, candidate_mvs + i); + i += 2; + } while (i < count); +} + +void MvProjectionCompoundForceInteger_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionCompoundClip( + temporal_mvs + i, temporal_reference_offsets + i, offsets); + ForceInteger(mv, candidate_mvs + i); + i += 2; + } while (i < count); +} + +void MvProjectionCompoundHighPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionCompoundClip( + temporal_mvs + i, temporal_reference_offsets + i, offsets); + StoreAligned16(candidate_mvs + i, mv); + i += 2; + } while (i < count); +} + +void MvProjectionSingleLowPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + LowPrecision(mv, candidate_mvs + i); + i += 4; + } while (i < count); +} + +void MvProjectionSingleForceInteger_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + ForceInteger(mv, candidate_mvs + i); + i += 4; + } while (i < count); +} + +void MvProjectionSingleHighPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. 
+ int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + StoreAligned16(candidate_mvs + i, mv); + i += 4; + } while (i < count); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; +} +#endif + +} // namespace + +void MotionVectorSearchInit_SSE4_1() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionVectorSearchInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_SSE4_1 diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h new file mode 100644 index 00000000000..b8b04123635 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h @@ -0,0 +1,37 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This +// function is not thread-safe. 
+void MotionVectorSearchInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_SSE4_1 +#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1 +#endif // LIBGAV1_ENABLE_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h index 2a10dc05633..cd61c9275d3 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h @@ -27,7 +27,7 @@ namespace libgav1 { namespace dsp { LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) { - // Unpack 16 bit elements. Goes from: + // Unpack 8 bit elements. Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 // in[2]: 20 21 22 23 @@ -43,10 +43,10 @@ LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) { return _mm_unpacklo_epi16(a0, a1); } -LIBGAV1_ALWAYS_INLINE void Transpose8x8_U8(const __m128i* const in, - __m128i* out) { - // Unpack 16 bit elements. Goes from: - // in[0]: 00 01 02 03 04 05 06 07 +LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in, + __m128i* out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 // in[1]: 10 11 12 13 14 15 16 17 // in[2]: 20 21 22 23 24 25 26 27 // in[3]: 30 31 32 33 34 35 36 37 diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc index 4003f5db459..922110ba573 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc @@ -19,11 +19,10 @@ #include <smmintrin.h> -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> +#include <cstring> #include <type_traits> #include "src/dsp/constants.h" @@ -69,7 +68,7 @@ inline void HorizontalFilter(const int sx4, const int16_t alpha, f = LoadLo8(kWarpedFilters8[offset]); sx += alpha; } - Transpose8x8_U8(filter, filter); + Transpose8x8To4x16_U8(filter, filter); // |filter| now contains two filters per register. // Staggered combinations allow us to take advantage of _mm_maddubs_epi16 // without overflowing the sign bit. 
The sign bit is hit only where two taps @@ -128,10 +127,10 @@ inline void WriteVerticalFilter(const __m128i filter[8], sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical); if (is_compound) { const __m128i sum = _mm_packs_epi32(sum_low, sum_high); - StoreUnaligned16(reinterpret_cast<int16_t*>(dst_row), sum); + StoreUnaligned16(static_cast<int16_t*>(dst_row), sum); } else { const __m128i sum = _mm_packus_epi32(sum_low, sum_high); - StoreLo8(reinterpret_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); + StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); } } @@ -159,22 +158,206 @@ inline void WriteVerticalFilter(const __m128i filter[8], sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical); if (is_compound) { const __m128i sum = _mm_packs_epi32(sum_low, sum_high); - StoreUnaligned16(reinterpret_cast<int16_t*>(dst_row), sum); + StoreUnaligned16(static_cast<int16_t*>(dst_row), sum); } else { const __m128i sum = _mm_packus_epi32(sum_low, sum_high); - StoreLo8(reinterpret_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); + StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); } } -template <bool is_compound> -void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, - int source_height, const int* warp_params, int subsampling_x, - int subsampling_y, int block_start_x, int block_start_y, - int block_width, int block_height, int16_t alpha, int16_t beta, - int16_t gamma, int16_t delta, void* dest, - ptrdiff_t dest_stride) { - constexpr int kRoundBitsVertical = - is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; +template <bool is_compound, typename DestType> +inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, + int delta, DestType* dest_row, + ptrdiff_t dest_stride) { + int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + __m128i filter[8]; + for (__m128i& f : filter) { + const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + f = LoadUnaligned16(kWarpedFilters[offset]); + sy += gamma; + } + Transpose8x8_U16(filter, filter); + WriteVerticalFilter<is_compound>(filter, source, y, dest_row); + dest_row += dest_stride; + sy4 += delta; + } +} + +template <bool is_compound, typename DestType> +inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma, + int delta, DestType* dest_row, + ptrdiff_t dest_stride) { + int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + __m128i filter[8]; + for (__m128i& f : filter) { + const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + f = LoadUnaligned16(kWarpedFilters[offset]); + sy += gamma; + } + Transpose8x8_U16(filter, filter); + WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row); + dest_row += dest_stride; + sy4 += delta; + } +} + +template <bool is_compound, typename DestType> +inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int source_height, int ix4, int iy4, + DestType* dst_row, ptrdiff_t dest_stride) { + // Region 1 + // Points to the left or right border of the first row of |src|. + const uint8_t* first_row_border = + (ix4 + 7 <= 0) ? 
src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Region 1. + // Every sample used to calculate the prediction block has the same + // value. So the whole prediction block has the same value. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint8_t row_border_pixel = first_row_border[row * source_stride]; + + if (is_compound) { + const __m128i sum = + _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical - + kInterRoundBitsCompoundVertical)); + StoreUnaligned16(dst_row, sum); + } else { + memset(dst_row, row_border_pixel, 8); + } + const DestType* const first_dst_row = dst_row; + dst_row += dest_stride; + for (int y = 1; y < 8; ++y) { + memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); + dst_row += dest_stride; + } +} + +template <bool is_compound, typename DestType> +inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int y4, int ix4, int iy4, int gamma, + int delta, int16_t intermediate_result_column[15], + DestType* dst_row, ptrdiff_t dest_stride) { + // Region 2. + // Points to the left or right border of the first row of |src|. + const uint8_t* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + + // Region 2. + // Horizontal filter. + // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + int sum = first_row_border[row * source_stride]; + sum <<= (kFilterBits - kInterRoundBitsHorizontal); + intermediate_result_column[y + 7] = sum; + } + // Region 2 vertical filter. + VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma, + delta, dst_row, dest_stride); +} + +template <bool is_compound, typename DestType> +inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride, + int source_height, int alpha, int beta, int x4, int ix4, + int iy4, int16_t intermediate_result[15][8]) { + // Region 3 + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. 
In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding byte after the right border of the last source row. + const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template <bool is_compound, typename DestType> +inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha, + int beta, int x4, int ix4, int iy4, + int16_t intermediate_result[15][8]) { + // Region 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding byte after the right border of the last source row. + const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + // Convert src_row_v to int8 (subtract 128). + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template <bool is_compound, typename DestType> +inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int source_height, + const int* warp_params, int subsampling_x, + int subsampling_y, int src_x, int src_y, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta, DestType* dst_row, + ptrdiff_t dest_stride) { union { // Intermediate_result is the output of the horizontal filtering and // rounding. 
The range is within 13 (= bitdepth + kFilterBits + 1 - @@ -187,242 +370,133 @@ void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, int16_t intermediate_result_column[15]; }; + const int dst_x = + src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; + const int dst_y = + src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; + const int x4 = dst_x >> subsampling_x; + const int y4 = dst_y >> subsampling_y; + const int ix4 = x4 >> kWarpedModelPrecisionBits; + const int iy4 = y4 >> kWarpedModelPrecisionBits; + // A prediction block may fall outside the frame's boundaries. If a + // prediction block is calculated using only samples outside the frame's + // boundary, the filtering can be simplified. We can divide the plane + // into several regions and handle them differently. + // + // | | + // 1 | 3 | 1 + // | | + // -------+-----------+------- + // |***********| + // 2 |*****4*****| 2 + // |***********| + // -------+-----------+------- + // | | + // 1 | 3 | 1 + // | | + // + // At the center, region 4 represents the frame and is the general case. + // + // In regions 1 and 2, the prediction block is outside the frame's + // boundary horizontally. Therefore the horizontal filtering can be + // simplified. Furthermore, in the region 1 (at the four corners), the + // prediction is outside the frame's boundary both horizontally and + // vertically, so we get a constant prediction block. + // + // In region 3, the prediction block is outside the frame's boundary + // vertically. Unfortunately because we apply the horizontal filters + // first, by the time we apply the vertical filters, they no longer see + // simple inputs. So the only simplification is that all the rows are + // the same, but we still need to apply all the horizontal and vertical + // filters. + + // Check for two simple special cases, where the horizontal filter can + // be significantly simplified. + // + // In general, for each row, the horizontal filter is calculated as + // follows: + // for (int x = -4; x < 4; ++x) { + // const int offset = ...; + // int sum = first_pass_offset; + // for (int k = 0; k < 8; ++k) { + // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); + // sum += kWarpedFilters[offset][k] * src_row[column]; + // } + // ... + // } + // The column index before clipping, ix4 + x + k - 3, varies in the range + // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame in both directions. One repeated value. + WarpRegion1<is_compound, DestType>(src, source_stride, source_width, + source_height, ix4, iy4, dst_row, + dest_stride); + return; + } + // Outside the frame horizontally. Rows repeated. + WarpRegion2<is_compound, DestType>( + src, source_stride, source_width, y4, ix4, iy4, gamma, delta, + intermediate_result_column, dst_row, dest_stride); + return; + } + + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame vertically. + WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha, + beta, x4, ix4, iy4, intermediate_result); + } else { + // Inside the frame. 
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4, + iy4, intermediate_result); + } + // Region 3 and 4 vertical filter. + VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta, + dst_row, dest_stride); +} + +template <bool is_compound> +void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, + int source_height, const int* warp_params, int subsampling_x, + int subsampling_y, int block_start_x, int block_start_y, + int block_width, int block_height, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta, void* dest, + ptrdiff_t dest_stride) { const auto* const src = static_cast<const uint8_t*>(source); using DestType = typename std::conditional<is_compound, int16_t, uint8_t>::type; auto* dst = static_cast<DestType*>(dest); + // Warp process applies for each 8x8 block. assert(block_width >= 8); assert(block_height >= 8); - - // Warp process applies for each 8x8 block (or smaller). - int start_y = block_start_y; + const int block_end_x = block_start_x + block_width; + const int block_end_y = block_start_y + block_height; + + const int start_x = block_start_x; + const int start_y = block_start_y; + int src_x = (start_x + 4) << subsampling_x; + int src_y = (start_y + 4) << subsampling_y; + const int end_x = (block_end_x + 4) << subsampling_x; + const int end_y = (block_end_y + 4) << subsampling_y; do { - int start_x = block_start_x; + DestType* dst_row = dst; + src_x = (start_x + 4) << subsampling_x; do { - const int src_x = (start_x + 4) << subsampling_x; - const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; - // A prediction block may fall outside the frame's boundaries. If a - // prediction block is calculated using only samples outside the frame's - // boundary, the filtering can be simplified. We can divide the plane - // into several regions and handle them differently. - // - // | | - // 1 | 3 | 1 - // | | - // -------+-----------+------- - // |***********| - // 2 |*****4*****| 2 - // |***********| - // -------+-----------+------- - // | | - // 1 | 3 | 1 - // | | - // - // At the center, region 4 represents the frame and is the general case. - // - // In regions 1 and 2, the prediction block is outside the frame's - // boundary horizontally. Therefore the horizontal filtering can be - // simplified. Furthermore, in the region 1 (at the four corners), the - // prediction is outside the frame's boundary both horizontally and - // vertically, so we get a constant prediction block. - // - // In region 3, the prediction block is outside the frame's boundary - // vertically. Unfortunately because we apply the horizontal filters - // first, by the time we apply the vertical filters, they no longer see - // simple inputs. So the only simplification is that all the rows are - // the same, but we still need to apply all the horizontal and vertical - // filters. - - // Check for two simple special cases, where the horizontal filter can - // be significantly simplified. 
- // - // In general, for each row, the horizontal filter is calculated as - // follows: - // for (int x = -4; x < 4; ++x) { - // const int offset = ...; - // int sum = first_pass_offset; - // for (int k = 0; k < 8; ++k) { - // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); - // sum += kWarpedFilters[offset][k] * src_row[column]; - // } - // ... - // } - // The column index before clipping, ix4 + x + k - 3, varies in the range - // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 - // or ix4 + 7 <= 0, then all the column indexes are clipped to the same - // border index (source_width - 1 or 0, respectively). Then for each x, - // the inner for loop of the horizontal filter is reduced to multiplying - // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { - // Regions 1 and 2. - // Points to the left or right border of the first row of |src|. - const uint8_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; - // In general, for y in [-7, 8), the row number iy4 + y is clipped: - // const int row = Clip3(iy4 + y, 0, source_height - 1); - // In two special cases, iy4 + y is clipped to either 0 or - // source_height - 1 for all y. In the rest of the cases, iy4 + y is - // bounded and we can avoid clipping iy4 + y by relying on a reference - // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { - // Region 1. - // Every sample used to calculate the prediction block has the same - // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; - const uint8_t row_border_pixel = - first_row_border[row * source_stride]; - - DestType* dst_row = dst + start_x - block_start_x; - if (is_compound) { - const __m128i sum = - _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical - - kRoundBitsVertical)); - StoreUnaligned16(dst_row, sum); - } else { - memset(dst_row, row_border_pixel, 8); - } - const DestType* const first_dst_row = dst_row; - dst_row += dest_stride; - for (int y = 1; y < 8; ++y) { - memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); - dst_row += dest_stride; - } - // End of region 1. Continue the |start_x| do-while loop. - start_x += 8; - continue; - } - - // Region 2. - // Horizontal filter. - // The input values in this region are generated by extending the border - // which makes them identical in the horizontal direction. This - // computation could be inlined in the vertical pass but most - // implementations will need a transpose of some sort. - // It is not necessary to use the offset values here because the - // horizontal pass is a simple shift and the vertical pass will always - // require using 32 bits. - for (int y = -7; y < 8; ++y) { - // We may over-read up to 13 pixels above the top source row, or up - // to 13 pixels below the bottom source row. This is proved in - // warp.cc. - const int row = iy4 + y; - int sum = first_row_border[row * source_stride]; - sum <<= (kFilterBits - kInterRoundBitsHorizontal); - intermediate_result_column[y + 7] = sum; - } - // Vertical filter. 
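For reference, the per-sample horizontal accumulation described in the pseudocode comment above can be written out as a small scalar routine. This is an illustrative sketch only: the |offset| selection and the final rounding are elided exactly as they are in the comment, |filter| stands in for kWarpedFilters[offset], and the function name is hypothetical rather than a libgav1 identifier.

#include <algorithm>
#include <cstdint>

// Scalar sketch of the clipped 8-tap horizontal accumulation. Columns outside
// [0, source_width - 1] clamp to the border pixel, which is what makes the
// region 1/2 special cases possible.
inline int WarpHorizontalTap8(const uint8_t* src_row, int source_width,
                              int ix4, int x, const int16_t filter[8],
                              int first_pass_offset) {
  int sum = first_pass_offset;
  for (int k = 0; k < 8; ++k) {
    const int column =
        std::min(std::max(ix4 + x + k - 3, 0), source_width - 1);
    sum += filter[k] * src_row[column];
  }
  return sum;
}

When ix4 + 7 <= 0 or ix4 - 7 >= source_width - 1, every |column| above clamps to the same border index, so the loop collapses to the border pixel times the sum of the filter taps, which is exactly the simplification the region 1 and region 2 paths exploit.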
- DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); - for (int y = 0; y < 8; ++y) { - int sy = sy4 - MultiplyBy4(gamma); - __m128i filter[8]; - for (__m128i& f : filter) { - const int offset = - RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + - kWarpedPixelPrecisionShifts; - f = LoadUnaligned16(kWarpedFilters[offset]); - sy += gamma; - } - Transpose8x8_U16(filter, filter); - WriteVerticalFilter<is_compound>( - filter, &intermediate_result_column[y], dst_row); - dst_row += dest_stride; - sy4 += delta; - } - // End of region 2. Continue the |start_x| do-while loop. - start_x += 8; - continue; - } - - // Regions 3 and 4. - // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. - - // In general, for y in [-7, 8), the row number iy4 + y is clipped: - // const int row = Clip3(iy4 + y, 0, source_height - 1); - // In two special cases, iy4 + y is clipped to either 0 or - // source_height - 1 for all y. In the rest of the cases, iy4 + y is - // bounded and we can avoid clipping iy4 + y by relying on a reference - // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { - // Region 3. - // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; - const uint8_t* const src_row = src + row * source_stride; - // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also - // read but is ignored. - // - // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 - // bytes after src_row[source_width - 1]. We assume the source frame - // has left and right borders of at least 13 bytes that extend the - // frame boundary pixels. We also assume there is at least one extra - // padding byte after the right border of the last source row. - const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; - for (int y = -7; y < 8; ++y) { - HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); - sx4 += beta; - } - } else { - // Region 4. - // Horizontal filter. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; - for (int y = -7; y < 8; ++y) { - // We may over-read up to 13 pixels above the top source row, or up - // to 13 pixels below the bottom source row. This is proved in - // warp.cc. - const int row = iy4 + y; - const uint8_t* const src_row = src + row * source_stride; - // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also - // read but is ignored. - // - // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 - // bytes after src_row[source_width - 1]. We assume the source frame - // has left and right borders of at least 13 bytes that extend the - // frame boundary pixels. We also assume there is at least one extra - // padding byte after the right border of the last source row. - const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); - // Convert src_row_v to int8 (subtract 128). - HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); - sx4 += beta; - } - } - - // Regions 3 and 4. - // Vertical filter. 
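The region picture drawn in the comments above boils down to two interval tests on the integer block position. A minimal standalone sketch of that classification, using the same comparisons HandleWarpBlock performs (the enum and function names here are illustrative, not libgav1 identifiers):

// Classifies an 8x8 warp block: horizontally outside the frame when
// ix4 - 7 >= source_width - 1 or ix4 + 7 <= 0, and analogously in the
// vertical direction with iy4 and source_height.
enum class WarpRegion { kCorner1, kHorizontalBorder2, kVerticalBorder3, kInterior4 };

inline WarpRegion ClassifyWarpBlock(int ix4, int iy4, int source_width,
                                    int source_height) {
  const bool outside_x = ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;
  const bool outside_y = iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;
  if (outside_x) {
    return outside_y ? WarpRegion::kCorner1 : WarpRegion::kHorizontalBorder2;
  }
  return outside_y ? WarpRegion::kVerticalBorder3 : WarpRegion::kInterior4;
}

Region 1 (the corners) yields a constant prediction block, region 2 needs only the trivial horizontal pass on a border column, and regions 3 and 4 share the full horizontal plus vertical filtering shown above.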
- DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); - for (int y = 0; y < 8; ++y) { - int sy = sy4 - MultiplyBy4(gamma); - __m128i filter[8]; - for (__m128i& f : filter) { - const int offset = - RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + - kWarpedPixelPrecisionShifts; - f = LoadUnaligned16(kWarpedFilters[offset]); - sy += gamma; - } - Transpose8x8_U16(filter, filter); - WriteVerticalFilter<is_compound>(filter, intermediate_result, y, - dst_row); - dst_row += dest_stride; - sy4 += delta; - } - start_x += 8; - } while (start_x < block_start_x + block_width); + HandleWarpBlock<is_compound, DestType>( + src, source_stride, source_width, source_height, warp_params, + subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta, + dst_row, dest_stride); + src_x += (8 << subsampling_x); + dst_row += 8; + } while (src_x < end_x); dst += 8 * dest_stride; - start_y += 8; - } while (start_y < block_start_y + block_height); + src_y += (8 << subsampling_y); + } while (src_y < end_y); } void Init8bpp() { diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h index 42309916eb0..841dd5a26af 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h @@ -36,6 +36,7 @@ void WeightMaskInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1 +#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1 diff --git a/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h b/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h index 6b336b0a58c..1d6a1f4fadb 100644 --- a/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h +++ b/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h @@ -17,17 +17,19 @@ #ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_ #define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_ +#include <condition_variable> // NOLINT (unapproved c++11 header) #include <cstdint> #include <memory> #include <mutex> // NOLINT (unapproved c++11 header) -#include "src/loop_filter_mask.h" #include "src/loop_restoration_info.h" #include "src/residual_buffer_pool.h" #include "src/symbol_decoder_context.h" #include "src/threading_strategy.h" #include "src/tile_scratch_buffer.h" #include "src/utils/array_2d.h" +#include "src/utils/block_parameters_holder.h" +#include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" #include "src/utils/dynamic_buffer.h" #include "src/utils/memory.h" @@ -37,17 +39,21 @@ namespace libgav1 { +// Buffer used to store the unfiltered pixels that are necessary for decoding +// the next superblock row (for the intra prediction process). +using IntraPredictionBuffer = + std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>; + // Buffer to facilitate decoding a frame. This struct is used only within // DecoderImpl::DecodeTiles(). 
struct FrameScratchBuffer { - LoopFilterMask loop_filter_mask; LoopRestorationInfo loop_restoration_info; Array2D<int16_t> cdef_index; Array2D<TransformSize> inter_transform_sizes; + BlockParametersHolder block_parameters_holder; TemporalMotionField motion_field; SymbolDecoderContext symbol_decoder_context; std::unique_ptr<ResidualBufferPool> residual_buffer_pool; - Array2D<SuperBlockState> superblock_state; // threaded_window_buffer will be subdivided by PostFilter into windows of // width 512 pixels. Each row in the window is filtered by a worker thread. // To avoid false sharing, each 512-pixel row processed by one thread should @@ -62,11 +68,22 @@ struct FrameScratchBuffer { // for every 32x32 for chroma with subsampling). The indices of the rows that // are stored are specified in |kDeblockedRowsForLoopRestoration|. YuvBuffer deblock_buffer; + // The size of this dynamic buffer is |tile_rows|. + DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers; TileScratchBufferPool tile_scratch_buffer_pool; - // TODO(vigneshv): This is part of the frame scratch buffer for now. This will - // have to change or move to DecoderImpl when frame parallel mode with - // in-frame multi-theading is implemented. ThreadingStrategy threading_strategy; + std::mutex superblock_row_mutex; + // The size of this buffer is the number of superblock rows. + // |superblock_row_progress[i]| is incremented whenever a tile finishes + // decoding superblock row at index i. If the count reaches tile_columns, then + // |superblock_row_progress_condvar[i]| is notified. + DynamicBuffer<int> superblock_row_progress + LIBGAV1_GUARDED_BY(superblock_row_mutex); + // The size of this buffer is the number of superblock rows. Used to wait for + // |superblock_row_progress[i]| to reach tile_columns. + DynamicBuffer<std::condition_variable> superblock_row_progress_condvar; + // Used to signal tile decoding failure in the combined multithreading mode. + bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex); }; class FrameScratchBufferPool { @@ -89,8 +106,6 @@ class FrameScratchBufferPool { private: std::mutex mutex_; - // TODO(b/142583029): The size of this stack is set to kMaxThreads. This may - // have to be revisited as we iterate over the frame parallel design. Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_ LIBGAV1_GUARDED_BY(mutex_); }; diff --git a/chromium/third_party/libgav1/src/src/gav1/decoder.h b/chromium/third_party/libgav1/src/src/gav1/decoder.h index 5151d647b6f..9d0d87291ee 100644 --- a/chromium/third_party/libgav1/src/src/gav1/decoder.h +++ b/chromium/third_party/libgav1/src/src/gav1/decoder.h @@ -94,11 +94,11 @@ class LIBGAV1_PUBLIC Decoder { // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a // successful |EnqueueFrame()| call, the caller must keep the |data| buffer // alive until: - // 1) If release_input_buffer is not nullptr, then |data| buffer must be kept - // alive until release_input_buffer is called with the |buffer_private_data| - // passed into this EnqueueFrame call. - // 2) If release_input_buffer is nullptr, then |data| buffer must be kept - // alive until the corresponding DequeueFrame() call is completed. + // 1) If |settings_.release_input_buffer| is not nullptr, then |data| buffer + // must be kept alive until release_input_buffer is called with the + // |buffer_private_data| passed into this EnqueueFrame call. 
+ // 2) If |settings_.release_input_buffer| is nullptr, then |data| buffer must + // be kept alive until the corresponding DequeueFrame() call is completed. StatusCode EnqueueFrame(const uint8_t* data, size_t size, int64_t user_private_data, void* buffer_private_data); @@ -107,9 +107,12 @@ class LIBGAV1_PUBLIC Decoder { // compressed frame. If there are no displayable frames available, sets // |*out_ptr| to nullptr. Returns an error status if there is an error. // - // In frame parallel mode, if |settings_.blocking_dequeue| is true, then this - // call will block until an enqueued frame has been decoded. Otherwise, it - // will return kStatusTryAgain if an enqueued frame is not yet decoded. + // If |settings_.blocking_dequeue| is false and the decoder is operating in + // frame parallel mode (|settings_.frame_parallel| is true and the video + // stream passes the decoder's heuristics for enabling frame parallel mode), + // then this call will return kStatusTryAgain if an enqueued frame is not yet + // decoded (it is a non blocking call in this case). In all other cases, this + // call will block until an enqueued frame has been decoded. StatusCode DequeueFrame(const DecoderBuffer** out_ptr); // Signals the end of stream. diff --git a/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h b/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h index d7ec8d6754b..33777248a3c 100644 --- a/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h +++ b/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h @@ -41,15 +41,13 @@ typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data, void* buffer_private_data); typedef struct Libgav1DecoderSettings { - // Number of threads to use when decoding. Must be greater than 0. The - // library will create at most |threads|-1 new threads, the calling thread is - // considered part of the library's thread count. Defaults to 1 (no new - // threads will be created). + // Number of threads to use when decoding. Must be greater than 0. The library + // will create at most |threads| new threads. Defaults to 1 (no new threads + // will be created). int threads; - // A boolean. Do frame parallel decoding. - // - // NOTE: Frame parallel decoding is not implemented, this setting is - // currently ignored. + // A boolean. Indicate to the decoder that frame parallel decoding is allowed. + // Note that this is just a request and the decoder will decide the number of + // frames to be decoded in parallel based on the video stream being decoded. int frame_parallel; // A boolean. In frame parallel mode, should Libgav1DecoderDequeueFrame wait // until a enqueued frame is available for dequeueing. @@ -91,15 +89,13 @@ using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback; // Applications must populate this structure before creating a decoder instance. struct DecoderSettings { - // Number of threads to use when decoding. Must be greater than 0. The - // library will create at most |threads|-1 new threads, the calling thread is - // considered part of the library's thread count. Defaults to 1 (no new - // threads will be created). + // Number of threads to use when decoding. Must be greater than 0. The library + // will create at most |threads| new threads. Defaults to 1 (no new threads + // will be created). int threads = 1; - // Do frame parallel decoding. - // - // NOTE: Frame parallel decoding is not implemented, this setting is - // currently ignored. 
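Taken together, the EnqueueFrame()/DequeueFrame() contract and the settings documented in these headers imply a calling pattern roughly like the sketch below. It is a hedged example, not canonical usage: the Init() entry point and the kStatusOk/kStatusTryAgain constants are assumed from libgav1's public headers rather than shown in this diff, and buffer ownership and error handling are simplified.

#include <cstddef>
#include <cstdint>

#include "gav1/decoder.h"

// Sketch of one enqueue/dequeue round trip against the documented contract.
// |data| must outlive the DequeueFrame() call because no release_input_buffer
// callback is installed here.
bool DecodeTemporalUnit(const uint8_t* data, size_t size) {
  libgav1::DecoderSettings settings;
  settings.threads = 4;              // The library creates at most 4 new threads.
  settings.frame_parallel = true;    // A request; the decoder decides.
  settings.blocking_dequeue = false; // Allow kStatusTryAgain from DequeueFrame.

  libgav1::Decoder decoder;
  if (decoder.Init(&settings) != libgav1::kStatusOk) return false;

  if (decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
                           /*buffer_private_data=*/nullptr) !=
      libgav1::kStatusOk) {
    return false;
  }

  const libgav1::DecoderBuffer* buffer = nullptr;
  libgav1::StatusCode status;
  do {
    // In non-blocking frame parallel mode this can return kStatusTryAgain
    // until the enqueued frame has actually been decoded.
    status = decoder.DequeueFrame(&buffer);
  } while (status == libgav1::kStatusTryAgain);
  // |buffer| may be null if there is nothing displayable yet.
  return status == libgav1::kStatusOk;
}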
+ // Indicate to the decoder that frame parallel decoding is allowed. Note that + // this is just a request and the decoder will decide the number of frames to + // be decoded in parallel based on the video stream being decoded. bool frame_parallel = false; // In frame parallel mode, should DequeueFrame wait until a enqueued frame is // available for dequeueing. diff --git a/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake b/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake index a97f1425dd3..b97d09def17 100644 --- a/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake +++ b/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake @@ -33,8 +33,6 @@ list(APPEND libgav1_decoder_sources "${libgav1_source}/inter_intra_masks.inc" "${libgav1_source}/internal_frame_buffer_list.cc" "${libgav1_source}/internal_frame_buffer_list.h" - "${libgav1_source}/loop_filter_mask.cc" - "${libgav1_source}/loop_filter_mask.h" "${libgav1_source}/loop_restoration_info.cc" "${libgav1_source}/loop_restoration_info.h" "${libgav1_source}/motion_vector.cc" @@ -43,6 +41,7 @@ list(APPEND libgav1_decoder_sources "${libgav1_source}/obu_parser.h" "${libgav1_source}/post_filter/cdef.cc" "${libgav1_source}/post_filter/deblock.cc" + "${libgav1_source}/post_filter/deblock_thresholds.inc" "${libgav1_source}/post_filter/loop_restoration.cc" "${libgav1_source}/post_filter/post_filter.cc" "${libgav1_source}/post_filter/super_res.cc" @@ -56,6 +55,7 @@ list(APPEND libgav1_decoder_sources "${libgav1_source}/reconstruction.h" "${libgav1_source}/residual_buffer_pool.cc" "${libgav1_source}/residual_buffer_pool.h" + "${libgav1_source}/scan_tables.inc" "${libgav1_source}/symbol_decoder_context.cc" "${libgav1_source}/symbol_decoder_context.h" "${libgav1_source}/symbol_decoder_context_cdfs.inc" diff --git a/chromium/third_party/libgav1/src/src/loop_filter_mask.cc b/chromium/third_party/libgav1/src/src/loop_filter_mask.cc deleted file mode 100644 index 8f96df9bf92..00000000000 --- a/chromium/third_party/libgav1/src/src/loop_filter_mask.cc +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright 2019 The libgav1 Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "src/loop_filter_mask.h" - -#include <algorithm> -#include <cstdint> -#include <cstring> -#include <memory> -#include <new> - -#include "src/utils/array_2d.h" -#include "src/utils/compiler_attributes.h" - -namespace libgav1 { - -#if !LIBGAV1_CXX17 -// static. -constexpr BitMaskSet LoopFilterMask::kPredictionModeDeltasMask; -#endif - -bool LoopFilterMask::Reset(int width, int height) { - num_64x64_blocks_per_row_ = DivideBy64(width + 63); - num_64x64_blocks_per_column_ = DivideBy64(height + 63); - const int num_64x64_blocks = - num_64x64_blocks_per_row_ * num_64x64_blocks_per_column_; - if (num_64x64_blocks_ == -1 || num_64x64_blocks_ < num_64x64_blocks) { - // Note that this need not be zero initialized here since we zero - // initialize the required number of entries in the loop that follows. 
- loop_filter_masks_.reset(new (std::nothrow) - Data[num_64x64_blocks]); // NOLINT. - if (loop_filter_masks_ == nullptr) { - return false; - } - } - for (int i = 0; i < num_64x64_blocks; ++i) { - memset(&loop_filter_masks_[i], 0, sizeof(loop_filter_masks_[i])); - } - num_64x64_blocks_ = num_64x64_blocks; - return true; -} - -void LoopFilterMask::Build( - const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, int tile_group_start, - int tile_group_end, const BlockParametersHolder& block_parameters_holder, - const Array2D<TransformSize>& inter_transform_sizes) { - for (int tile_number = tile_group_start; tile_number <= tile_group_end; - ++tile_number) { - const int row = tile_number / frame_header.tile_info.tile_columns; - const int column = tile_number % frame_header.tile_info.tile_columns; - const int row4x4_start = frame_header.tile_info.tile_row_start[row]; - const int row4x4_end = frame_header.tile_info.tile_row_start[row + 1]; - const int column4x4_start = - frame_header.tile_info.tile_column_start[column]; - const int column4x4_end = - frame_header.tile_info.tile_column_start[column + 1]; - - const int num_planes = sequence_header.color_config.is_monochrome - ? kMaxPlanesMonochrome - : kMaxPlanes; - for (int plane = kPlaneY; plane < num_planes; ++plane) { - // For U and V planes, do not build bit masks if level == 0. - if (plane > kPlaneY && frame_header.loop_filter.level[plane + 1] == 0) { - continue; - } - const int8_t subsampling_x = - (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x; - const int8_t subsampling_y = - (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_y; - const int vertical_step = 1 << subsampling_y; - const int horizontal_step = 1 << subsampling_x; - - // Build bit masks for vertical edges (except the frame boundary). - if (column4x4_start != 0) { - const int plane_height = - RightShiftWithRounding(frame_header.height, subsampling_y); - const int row4x4_limit = - std::min(row4x4_end, DivideBy4(plane_height + 3) << subsampling_y); - const int vertical_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical]; - for (int row4x4 = GetDeblockPosition(row4x4_start, subsampling_y); - row4x4 < row4x4_limit; row4x4 += vertical_step) { - const int column4x4 = - GetDeblockPosition(column4x4_start, subsampling_x); - const BlockParameters& bp = - *block_parameters_holder.Find(row4x4, column4x4); - const uint8_t vertical_level = - bp.deblock_filter_level[vertical_level_index]; - const BlockParameters& bp_left = *block_parameters_holder.Find( - row4x4, column4x4 - horizontal_step); - const uint8_t left_level = - bp_left.deblock_filter_level[vertical_level_index]; - const int unit_id = DivideBy16(row4x4) * num_64x64_blocks_per_row_ + - DivideBy16(column4x4); - const int row = row4x4 % kNum4x4InLoopFilterMaskUnit; - const int column = column4x4 % kNum4x4InLoopFilterMaskUnit; - const int shift = LoopFilterMask::GetShift(row, column); - const int index = LoopFilterMask::GetIndex(row); - const auto mask = static_cast<uint64_t>(1) << shift; - // Tile boundary must be coding block boundary. So we don't have to - // check (!left_skip || !skip || is_vertical_border). - if (vertical_level != 0 || left_level != 0) { - assert(inter_transform_sizes[row4x4] != nullptr); - const TransformSize tx_size = - (plane == kPlaneY) ? inter_transform_sizes[row4x4][column4x4] - : bp.uv_transform_size; - const TransformSize left_tx_size = - (plane == kPlaneY) - ? 
inter_transform_sizes[row4x4][column4x4 - horizontal_step] - : bp_left.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - GetTransformSizeIdWidth(tx_size, left_tx_size); - SetLeft(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (vertical_level == 0) ? left_level : vertical_level; - SetLevel(current_level, unit_id, plane, kLoopFilterTypeVertical, - LoopFilterMask::GetLevelOffset(row, column)); - } - } - } - - // Build bit masks for horizontal edges (except the frame boundary). - if (row4x4_start != 0) { - const int plane_width = - RightShiftWithRounding(frame_header.width, subsampling_x); - const int column4x4_limit = std::min( - column4x4_end, DivideBy4(plane_width + 3) << subsampling_y); - const int horizontal_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal]; - for (int column4x4 = GetDeblockPosition(column4x4_start, subsampling_x); - column4x4 < column4x4_limit; column4x4 += horizontal_step) { - const int row4x4 = GetDeblockPosition(row4x4_start, subsampling_y); - const BlockParameters& bp = - *block_parameters_holder.Find(row4x4, column4x4); - const uint8_t horizontal_level = - bp.deblock_filter_level[horizontal_level_index]; - const BlockParameters& bp_top = - *block_parameters_holder.Find(row4x4 - vertical_step, column4x4); - const uint8_t top_level = - bp_top.deblock_filter_level[horizontal_level_index]; - const int unit_id = DivideBy16(row4x4) * num_64x64_blocks_per_row_ + - DivideBy16(column4x4); - const int row = row4x4 % kNum4x4InLoopFilterMaskUnit; - const int column = column4x4 % kNum4x4InLoopFilterMaskUnit; - const int shift = LoopFilterMask::GetShift(row, column); - const int index = LoopFilterMask::GetIndex(row); - const auto mask = static_cast<uint64_t>(1) << shift; - // Tile boundary must be coding block boundary. So we don't have to - // check (!top_skip || !skip || is_horizontal_border). - if (horizontal_level != 0 || top_level != 0) { - assert(inter_transform_sizes[row4x4] != nullptr); - const TransformSize tx_size = - (plane == kPlaneY) ? inter_transform_sizes[row4x4][column4x4] - : bp.uv_transform_size; - const TransformSize top_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes[row4x4 - vertical_step][column4x4] - : bp_top.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - static_cast<LoopFilterTransformSizeId>( - std::min({kTransformHeightLog2[tx_size] - 2, - kTransformHeightLog2[top_tx_size] - 2, 2})); - SetTop(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (horizontal_level == 0) ? 
top_level : horizontal_level; - SetLevel(current_level, unit_id, plane, kLoopFilterTypeHorizontal, - LoopFilterMask::GetLevelOffset(row, column)); - } - } - } - } - } - assert(IsValid()); -} - -bool LoopFilterMask::IsValid() const { - for (int mask_id = 0; mask_id < num_64x64_blocks_; ++mask_id) { - for (int plane = 0; plane < kMaxPlanes; ++plane) { - for (int i = 0; i < kNumLoopFilterTransformSizeIds; ++i) { - for (int j = i + 1; j < kNumLoopFilterTransformSizeIds; ++j) { - for (int k = 0; k < kNumLoopFilterMasks; ++k) { - if ((loop_filter_masks_[mask_id].left[plane][i][k] & - loop_filter_masks_[mask_id].left[plane][j][k]) != 0 || - (loop_filter_masks_[mask_id].top[plane][i][k] & - loop_filter_masks_[mask_id].top[plane][j][k]) != 0) { - return false; - } - } - } - } - } - } - return true; -} - -} // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/loop_filter_mask.h b/chromium/third_party/libgav1/src/src/loop_filter_mask.h deleted file mode 100644 index 314f020b99b..00000000000 --- a/chromium/third_party/libgav1/src/src/loop_filter_mask.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright 2019 The libgav1 Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LIBGAV1_SRC_LOOP_FILTER_MASK_H_ -#define LIBGAV1_SRC_LOOP_FILTER_MASK_H_ - -#include <array> -#include <cassert> -#include <cstdint> -#include <memory> - -#include "src/dsp/constants.h" -#include "src/dsp/dsp.h" -#include "src/obu_parser.h" -#include "src/utils/array_2d.h" -#include "src/utils/bit_mask_set.h" -#include "src/utils/block_parameters_holder.h" -#include "src/utils/common.h" -#include "src/utils/constants.h" -#include "src/utils/segmentation.h" -#include "src/utils/types.h" - -namespace libgav1 { - -class LoopFilterMask { - public: - // This structure holds loop filter bit masks for a 64x64 block. - // 64x64 block contains kNum4x4In64x64 = (64x64 / (4x4) = 256) - // 4x4 blocks. It requires kNumLoopFilterMasks = 4 uint64_t to represent them. - struct Data : public Allocable { - uint8_t level[kMaxPlanes][kNumLoopFilterTypes][kNum4x4In64x64]; - uint64_t left[kMaxPlanes][kNumLoopFilterTransformSizeIds] - [kNumLoopFilterMasks]; - uint64_t top[kMaxPlanes][kNumLoopFilterTransformSizeIds] - [kNumLoopFilterMasks]; - }; - - LoopFilterMask() = default; - - // Loop filter mask is built and used for each superblock individually. - // Thus not copyable/movable. - LoopFilterMask(const LoopFilterMask&) = delete; - LoopFilterMask& operator=(const LoopFilterMask&) = delete; - LoopFilterMask(LoopFilterMask&&) = delete; - LoopFilterMask& operator=(LoopFilterMask&&) = delete; - - // Allocates the loop filter masks for the given |width| and - // |height| if necessary and zeros out the appropriate number of - // entries. Returns true on success. - bool Reset(int width, int height); - - // Builds bit masks for tile boundaries. - // This function is called after the frame has been decoded so that - // information across tiles is available. 
- // Before this function call, bit masks of transform edges other than those - // on tile boundaries are built together with tile decoding, in - // Tile::BuildBitMask(). - void Build(const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, int tile_group_start, - int tile_group_end, - const BlockParametersHolder& block_parameters_holder, - const Array2D<TransformSize>& inter_transform_sizes); - - uint8_t GetLevel(int mask_id, int plane, LoopFilterType type, - int offset) const { - return loop_filter_masks_[mask_id].level[plane][type][offset]; - } - - uint64_t GetLeft(int mask_id, int plane, LoopFilterTransformSizeId tx_size_id, - int index) const { - return loop_filter_masks_[mask_id].left[plane][tx_size_id][index]; - } - - uint64_t GetTop(int mask_id, int plane, LoopFilterTransformSizeId tx_size_id, - int index) const { - return loop_filter_masks_[mask_id].top[plane][tx_size_id][index]; - } - - int num_64x64_blocks_per_row() const { return num_64x64_blocks_per_row_; } - - void SetLeft(uint64_t new_mask, int mask_id, int plane, - LoopFilterTransformSizeId transform_size_id, int index) { - loop_filter_masks_[mask_id].left[plane][transform_size_id][index] |= - new_mask; - } - - void SetTop(uint64_t new_mask, int mask_id, int plane, - LoopFilterTransformSizeId transform_size_id, int index) { - loop_filter_masks_[mask_id].top[plane][transform_size_id][index] |= - new_mask; - } - - void SetLevel(uint8_t level, int mask_id, int plane, LoopFilterType type, - int offset) { - loop_filter_masks_[mask_id].level[plane][type][offset] = level; - } - - static int GetIndex(int row4x4) { return row4x4 >> 2; } - - static int GetShift(int row4x4, int column4x4) { - return ((row4x4 & 3) << 4) | column4x4; - } - - static int GetLevelOffset(int row4x4, int column4x4) { - assert(row4x4 < 16); - assert(column4x4 < 16); - return (row4x4 << 4) | column4x4; - } - - static constexpr int GetModeId(PredictionMode mode) { - return static_cast<int>(kPredictionModeDeltasMask.Contains(mode)); - } - - // 7.14.5. - static void ComputeDeblockFilterLevels( - const ObuFrameHeader& frame_header, int segment_id, int level_index, - const int8_t delta_lf[kFrameLfCount], - uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) { - const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0]; - uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, - 0, kMaxLoopFilterValue); - const auto feature = static_cast<SegmentFeature>( - kSegmentFeatureLoopFilterYVertical + level_index); - level = Clip3( - level + frame_header.segmentation.feature_data[segment_id][feature], 0, - kMaxLoopFilterValue); - if (!frame_header.loop_filter.delta_enabled) { - static_assert(sizeof(deblock_filter_levels[0][0]) == 1, ""); - memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2); - return; - } - assert(frame_header.loop_filter.delta_enabled); - const int shift = level >> 5; - deblock_filter_levels[kReferenceFrameIntra][0] = Clip3( - level + - LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra], - shift), - 0, kMaxLoopFilterValue); - // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does - // not have to be populated. 
- for (int reference_frame = kReferenceFrameIntra + 1; - reference_frame < kNumReferenceFrameTypes; ++reference_frame) { - for (int mode_id = 0; mode_id < 2; ++mode_id) { - deblock_filter_levels[reference_frame][mode_id] = Clip3( - level + - LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] + - frame_header.loop_filter.mode_deltas[mode_id], - shift), - 0, kMaxLoopFilterValue); - } - } - } - - private: - std::unique_ptr<Data[]> loop_filter_masks_; - int num_64x64_blocks_ = -1; - int num_64x64_blocks_per_row_; - int num_64x64_blocks_per_column_; - - // Mask used to determine the index for mode_deltas lookup. - static constexpr BitMaskSet kPredictionModeDeltasMask{ - BitMaskSet(kPredictionModeNearestMv, kPredictionModeNearMv, - kPredictionModeNewMv, kPredictionModeNearestNearestMv, - kPredictionModeNearNearMv, kPredictionModeNearestNewMv, - kPredictionModeNewNearestMv, kPredictionModeNearNewMv, - kPredictionModeNewNearMv, kPredictionModeNewNewMv)}; - - // Validates that the loop filter masks at different transform sizes are - // mutually exclusive. Only used in an assert. This function will not be used - // in optimized builds. - bool IsValid() const; -}; - -} // namespace libgav1 - -#endif // LIBGAV1_SRC_LOOP_FILTER_MASK_H_ diff --git a/chromium/third_party/libgav1/src/src/motion_vector.cc b/chromium/third_party/libgav1/src/src/motion_vector.cc index c7a496e5979..8223f3decc1 100644 --- a/chromium/third_party/libgav1/src/src/motion_vector.cc +++ b/chromium/third_party/libgav1/src/src/motion_vector.cc @@ -479,19 +479,28 @@ void TemporalScan(const Tile::Block& block, bool is_compound, if (count != 0) { BlockParameters* const bp = block.bp; int reference_offsets[2]; - const int offset_0 = GetRelativeDistance( - tile.frame_header().order_hint, - tile.current_frame().order_hint(bp->reference_frame[0]), - tile.sequence_header().order_hint_shift_bits); + const int offset_0 = tile.current_frame() + .reference_info() + ->relative_distance_to[bp->reference_frame[0]]; reference_offsets[0] = Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance); if (is_compound) { - const int offset_1 = GetRelativeDistance( - tile.frame_header().order_hint, - tile.current_frame().order_hint(bp->reference_frame[1]), - tile.sequence_header().order_hint_shift_bits); + const int offset_1 = tile.current_frame() + .reference_info() + ->relative_distance_to[bp->reference_frame[1]]; reference_offsets[1] = Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance); + // Pad so that SIMD implementations won't read uninitialized memory. + if ((count & 1) != 0) { + temporal_mvs[count].mv32 = 0; + temporal_reference_offsets[count] = 0; + } + } else { + // Pad so that SIMD implementations won't read uninitialized memory. + for (int i = count; i < ((count + 3) & ~3); ++i) { + temporal_mvs[i].mv32 = 0; + temporal_reference_offsets[i] = 0; + } } AddTemporalReferenceMvCandidate( tile.frame_header(), reference_offsets, temporal_mvs, @@ -752,12 +761,12 @@ void AddSample(const Tile::Block& block, int delta_row, int delta_column, // or -1 so that it can be XORed and subtracted directly in ApplySign() and // corresponding SIMD implementations. 
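The relative_distance_to / relative_distance_from values consumed above are signed order-hint differences, precomputed once per frame instead of calling GetRelativeDistance() at every use. As a reminder of the underlying arithmetic, a spec-style sketch of the relative-distance computation is shown below; it mirrors AV1's get_relative_dist() and is not the exact libgav1 implementation (which carries the equivalent information as order_hint_shift_bits).

// Spec-style relative distance between two order hints, for illustration.
// The low |order_hint_bits| bits of the difference are sign-extended so that
// wrapped-around order hints still yield a small signed distance; a positive
// result means |a| is later than |b| in display order.
inline int RelativeDistance(int a, int b, int order_hint_bits) {
  if (order_hint_bits == 0) return 0;
  int diff = a - b;
  const int m = 1 << (order_hint_bits - 1);
  diff = (diff & (m - 1)) - (diff & m);
  return diff;
}

The motion field code then clips these distances to [-kMaxFrameDistance, kMaxFrameDistance], as seen in TemporalScan() above.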
bool MotionFieldProjection( - const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame, + const ObuFrameHeader& frame_header, const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>& reference_frames, - ReferenceFrameType source, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* const motion_field) { + ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign, + int y8_start, int y8_end, int x8_start, int x8_end, + TemporalMotionField* const motion_field) { const int source_index = frame_header.reference_frame_index[source - kReferenceFrameLast]; auto* const source_frame = reference_frames[source_index].get(); @@ -770,12 +779,10 @@ bool MotionFieldProjection( } assert(reference_to_current_with_sign >= -kMaxFrameDistance); if (reference_to_current_with_sign > kMaxFrameDistance) return true; + const ReferenceInfo& reference_info = *source_frame->reference_info(); const dsp::Dsp& dsp = *dsp::GetDspTable(8); dsp.motion_field_projection_kernel( - source_frame->motion_field_reference_frame(y8_start, 0), - source_frame->motion_field_mv(y8_start, 0), - source_frame->order_hint_array(), current_frame.order_hint(source), - order_hint_shift_bits, reference_to_current_with_sign, dst_sign, y8_start, + reference_info, reference_to_current_with_sign, dst_sign, y8_start, y8_end, x8_start, x8_end, motion_field); return true; } @@ -921,62 +928,58 @@ void SetupMotionField( const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame, const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>& reference_frames, - unsigned int order_hint_shift_bits, int row4x4_start, int row4x4_end, - int column4x4_start, int column4x4_end, + int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end, TemporalMotionField* const motion_field) { assert(frame_header.use_ref_frame_mvs); - assert(order_hint_shift_bits != 0); const int y8_start = DivideBy2(row4x4_start); const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4)); const int x8_start = DivideBy2(column4x4_start); const int x8_end = DivideBy2(std::min(column4x4_end, frame_header.columns4x4)); - const int8_t* const reference_frame_index = - frame_header.reference_frame_index; - const int last_index = reference_frame_index[0]; - const int last_alternate_order_hint = - reference_frames[last_index]->order_hint(kReferenceFrameAlternate); - const int current_gold_order_hint = - current_frame.order_hint(kReferenceFrameGolden); - if (last_alternate_order_hint != current_gold_order_hint) { - const int reference_offset_last = - -GetRelativeDistance(current_frame.order_hint(kReferenceFrameLast), - frame_header.order_hint, order_hint_shift_bits); - if (std::abs(reference_offset_last) <= kMaxFrameDistance) { - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameLast, order_hint_shift_bits, - reference_offset_last, -1, y8_start, y8_end, - x8_start, x8_end, motion_field); + const int last_index = frame_header.reference_frame_index[0]; + const ReferenceInfo& reference_info = *current_frame.reference_info(); + if (!IsIntraFrame(reference_frames[last_index]->frame_type())) { + const int last_alternate_order_hint = + reference_frames[last_index] + ->reference_info() + ->order_hint[kReferenceFrameAlternate]; + const int current_gold_order_hint = + reference_info.order_hint[kReferenceFrameGolden]; + if (last_alternate_order_hint != current_gold_order_hint) { + 
const int reference_offset_last = + -reference_info.relative_distance_from[kReferenceFrameLast]; + if (std::abs(reference_offset_last) <= kMaxFrameDistance) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast, reference_offset_last, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); + } } } int ref_stamp = 1; const int reference_offset_backward = - GetRelativeDistance(current_frame.order_hint(kReferenceFrameBackward), - frame_header.order_hint, order_hint_shift_bits); + reference_info.relative_distance_from[kReferenceFrameBackward]; if (reference_offset_backward > 0 && - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameBackward, order_hint_shift_bits, - reference_offset_backward, 0, y8_start, y8_end, - x8_start, x8_end, motion_field)) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameBackward, reference_offset_backward, + 0, y8_start, y8_end, x8_start, x8_end, + motion_field)) { --ref_stamp; } const int reference_offset_alternate2 = - GetRelativeDistance(current_frame.order_hint(kReferenceFrameAlternate2), - frame_header.order_hint, order_hint_shift_bits); + reference_info.relative_distance_from[kReferenceFrameAlternate2]; if (reference_offset_alternate2 > 0 && - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameAlternate2, order_hint_shift_bits, + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate2, reference_offset_alternate2, 0, y8_start, y8_end, x8_start, x8_end, motion_field)) { --ref_stamp; } if (ref_stamp >= 0) { const int reference_offset_alternate = - GetRelativeDistance(current_frame.order_hint(kReferenceFrameAlternate), - frame_header.order_hint, order_hint_shift_bits); + reference_info.relative_distance_from[kReferenceFrameAlternate]; if (reference_offset_alternate > 0 && - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameAlternate, order_hint_shift_bits, + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate, reference_offset_alternate, 0, y8_start, y8_end, x8_start, x8_end, motion_field)) { --ref_stamp; @@ -984,13 +987,11 @@ void SetupMotionField( } if (ref_stamp >= 0) { const int reference_offset_last2 = - -GetRelativeDistance(current_frame.order_hint(kReferenceFrameLast2), - frame_header.order_hint, order_hint_shift_bits); + -reference_info.relative_distance_from[kReferenceFrameLast2]; if (std::abs(reference_offset_last2) <= kMaxFrameDistance) { - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameLast2, order_hint_shift_bits, - reference_offset_last2, -1, y8_start, y8_end, - x8_start, x8_end, motion_field); + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast2, reference_offset_last2, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); } } } diff --git a/chromium/third_party/libgav1/src/src/motion_vector.h b/chromium/third_party/libgav1/src/src/motion_vector.h index f34bebb5346..d739e802831 100644 --- a/chromium/third_party/libgav1/src/src/motion_vector.h +++ b/chromium/third_party/libgav1/src/src/motion_vector.h @@ -51,8 +51,8 @@ void SetupMotionField( const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame, const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>& reference_frames, - unsigned int order_hint_shift_bits, int row4x4_start, int row4x4_end, - int column4x4_start, int column4x4_end, TemporalMotionField* motion_field); + int row4x4_start, int 
row4x4_end, int column4x4_start, int column4x4_end, + TemporalMotionField* motion_field); } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/obu_parser.cc b/chromium/third_party/libgav1/src/src/obu_parser.cc index 0a3ccd49254..ffa267fb348 100644 --- a/chromium/third_party/libgav1/src/src/obu_parser.cc +++ b/chromium/third_party/libgav1/src/src/obu_parser.cc @@ -1080,29 +1080,32 @@ void ObuParser::ComputeSegmentLosslessAndQIndex() { } bool ObuParser::ParseCdefParameters() { + const int coeff_shift = sequence_header_.color_config.bitdepth - 8; if (frame_header_.coded_lossless || frame_header_.allow_intrabc || !sequence_header_.enable_cdef) { - frame_header_.cdef.damping = 3; + frame_header_.cdef.damping = 3 + coeff_shift; return true; } Cdef* const cdef = &frame_header_.cdef; int64_t scratch; OBU_READ_LITERAL_OR_FAIL(2); - cdef->damping = scratch + 3; + cdef->damping = scratch + 3 + coeff_shift; OBU_READ_LITERAL_OR_FAIL(2); cdef->bits = scratch; for (int i = 0; i < (1 << cdef->bits); ++i) { OBU_READ_LITERAL_OR_FAIL(4); - cdef->y_primary_strength[i] = scratch; + cdef->y_primary_strength[i] = scratch << coeff_shift; OBU_READ_LITERAL_OR_FAIL(2); cdef->y_secondary_strength[i] = scratch; if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i]; + cdef->y_secondary_strength[i] <<= coeff_shift; if (sequence_header_.color_config.is_monochrome) continue; OBU_READ_LITERAL_OR_FAIL(4); - cdef->uv_primary_strength[i] = scratch; + cdef->uv_primary_strength[i] = scratch << coeff_shift; OBU_READ_LITERAL_OR_FAIL(2); cdef->uv_secondary_strength[i] = scratch; if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i]; + cdef->uv_secondary_strength[i] <<= coeff_shift; } return true; } @@ -1192,6 +1195,12 @@ bool ObuParser::IsSkipModeAllowed() { const unsigned int reference_hint = decoder_state_ .reference_order_hint[frame_header_.reference_frame_index[i]]; + // TODO(linfengz): |relative_distance| equals + // current_frame_->reference_info()-> + // relative_distance_from[i + kReferenceFrameLast]; + // However, the unit test ObuParserTest.SkipModeParameters() would fail. + // Will figure out how to initialize |current_frame_.reference_info_| in the + // RefCountedBuffer later. const int relative_distance = GetRelativeDistance(reference_hint, frame_header_.order_hint, sequence_header_.order_hint_shift_bits); @@ -1842,7 +1851,6 @@ bool ObuParser::ParseFrameParameters() { if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) { decoder_state_.reference_valid.fill(false); decoder_state_.reference_order_hint.fill(0); - current_frame_->ClearOrderHints(); } OBU_READ_BIT_OR_FAIL; frame_header_.enable_cdf_update = !static_cast<bool>(scratch); @@ -2092,16 +2100,44 @@ bool ObuParser::ParseFrameParameters() { return false; } if (!IsIntraFrame(frame_header_.frame_type)) { - for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { - const auto reference_frame = - static_cast<ReferenceFrameType>(kReferenceFrameLast + i); + // Initialize the kReferenceFrameIntra type reference frame information to + // simplify the frame type validation in motion field projection. + // Set the kReferenceFrameIntra type |order_hint_| to + // |frame_header_.order_hint|. This guarantees that in SIMD implementations, + // the other reference frame information of the kReferenceFrameIntra type + // could be correctly initialized using the following loop with + // |frame_header_.order_hint| being the |hint|. 
+ ReferenceInfo* const reference_info = current_frame_->reference_info(); + reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint; + reference_info->relative_distance_from[kReferenceFrameIntra] = 0; + reference_info->relative_distance_to[kReferenceFrameIntra] = 0; + reference_info->skip_references[kReferenceFrameIntra] = true; + reference_info->projection_divisions[kReferenceFrameIntra] = 0; + + for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) { + const auto reference_frame = static_cast<ReferenceFrameType>(i); const uint8_t hint = - decoder_state_ - .reference_order_hint[frame_header_.reference_frame_index[i]]; - current_frame_->set_order_hint(reference_frame, hint); - decoder_state_.reference_frame_sign_bias[reference_frame] = + decoder_state_.reference_order_hint + [frame_header_.reference_frame_index[i - kReferenceFrameLast]]; + reference_info->order_hint[reference_frame] = hint; + const int relative_distance_from = GetRelativeDistance(hint, frame_header_.order_hint, - sequence_header_.order_hint_shift_bits) > 0; + sequence_header_.order_hint_shift_bits); + const int relative_distance_to = + GetRelativeDistance(frame_header_.order_hint, hint, + sequence_header_.order_hint_shift_bits); + reference_info->relative_distance_from[reference_frame] = + relative_distance_from; + reference_info->relative_distance_to[reference_frame] = + relative_distance_to; + reference_info->skip_references[reference_frame] = + relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0; + reference_info->projection_divisions[reference_frame] = + reference_info->skip_references[reference_frame] + ? 0 + : kProjectionMvDivisionLookup[relative_distance_to]; + decoder_state_.reference_frame_sign_bias[reference_frame] = + relative_distance_from > 0; } } if (frame_header_.enable_cdf_update && @@ -2128,6 +2164,11 @@ bool ObuParser::ParseFrameHeader() { ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters(); if (!status) return false; ComputeSegmentLosslessAndQIndex(); + // Section 6.8.2: It is a requirement of bitstream conformance that + // delta_q_present is equal to 0 when CodedLossless is equal to 1. + if (frame_header_.coded_lossless && frame_header_.delta_q.present) { + return false; + } status = ParseLoopFilterParameters(); if (!status) return false; current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter); diff --git a/chromium/third_party/libgav1/src/src/post_filter.h b/chromium/third_party/libgav1/src/src/post_filter.h index 16c784ac458..c7af197575d 100644 --- a/chromium/third_party/libgav1/src/src/post_filter.h +++ b/chromium/third_party/libgav1/src/src/post_filter.h @@ -27,7 +27,7 @@ #include "src/dsp/common.h" #include "src/dsp/dsp.h" -#include "src/loop_filter_mask.h" +#include "src/frame_scratch_buffer.h" #include "src/loop_restoration_info.h" #include "src/obu_parser.h" #include "src/utils/array_2d.h" @@ -46,8 +46,6 @@ namespace libgav1 { // and loop restoration. // Historically, for example in libaom, loop filter refers to deblock filter. // To avoid name conflicts, we call this class PostFilter (post processing). -// Input info includes deblock parameters (bit masks), CDEF -// parameters, super resolution parameters and loop restoration parameters. // In-loop post filtering order is: // deblock --> CDEF --> super resolution--> loop restoration. // When CDEF and super resolution is not used, we can combine deblock @@ -76,14 +74,9 @@ class PostFilter { // * Output: |loop_restoration_buffer_|. 
// -> Now |frame_buffer_| contains the filtered frame. PostFilter(const ObuFrameHeader& frame_header, - const ObuSequenceHeader& sequence_header, LoopFilterMask* masks, - const Array2D<int16_t>& cdef_index, - const Array2D<TransformSize>& inter_transform_sizes, - LoopRestorationInfo* restoration_info, - BlockParametersHolder* block_parameters, YuvBuffer* frame_buffer, - YuvBuffer* deblock_buffer, const dsp::Dsp* dsp, - ThreadPool* thread_pool, uint8_t* threaded_window_buffer, - uint8_t* superres_line_buffer, int do_post_filter_mask); + const ObuSequenceHeader& sequence_header, + FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer, + const dsp::Dsp* dsp, int do_post_filter_mask); // non copyable/movable. PostFilter(const PostFilter&) = delete; @@ -123,9 +116,9 @@ class PostFilter { // with a shift to the top-left). void ApplyFilteringThreaded(); - // Does the overall post processing filter for one superblock row (starting at - // |row4x4| with height 4*|sb4x4|. Cdef, SuperRes and Loop Restoration lag by - // one superblock row to account for deblocking. + // Does the overall post processing filter for one superblock row starting at + // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter + // will not be applied. // // Filter behavior (single-threaded): // * Deblock: In-place filtering. The output is written to |source_buffer_|. @@ -143,26 +136,35 @@ class PostFilter { // top-left). // Returns the index of the last row whose post processing is complete and can // be used for referencing. - int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, - bool is_last_row); + int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row, + bool do_deblock); + + // Apply deblocking filter in one direction (specified by |loop_filter_type|) + // for the superblock row starting at |row4x4_start| for columns starting from + // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling) + // until the smallest multiple of 16 that is >= |column4x4_end| or until + // |frame_header_.columns4x4|, whichever is lower. This function must be + // called only if |DoDeblock()| returns true. + void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start, + int column4x4_start, int column4x4_end, int sb4x4); - bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); } static bool DoCdef(const ObuFrameHeader& frame_header, int do_post_filter_mask) { - return (do_post_filter_mask & 0x02) != 0 && - (frame_header.cdef.bits > 0 || + return (frame_header.cdef.bits > 0 || frame_header.cdef.y_primary_strength[0] > 0 || frame_header.cdef.y_secondary_strength[0] > 0 || frame_header.cdef.uv_primary_strength[0] > 0 || - frame_header.cdef.uv_secondary_strength[0] > 0); + frame_header.cdef.uv_secondary_strength[0] > 0) && + (do_post_filter_mask & 0x02) != 0; } + bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); } // If filter levels for Y plane (0 for vertical, 1 for horizontal), // are all zero, deblock filter will not be applied. 
static bool DoDeblock(const ObuFrameHeader& frame_header, uint8_t do_post_filter_mask) { - return (do_post_filter_mask & 0x01) != 0 && - (frame_header.loop_filter.level[0] > 0 || - frame_header.loop_filter.level[1] > 0); + return (frame_header.loop_filter.level[0] > 0 || + frame_header.loop_filter.level[1] > 0) && + (do_post_filter_mask & 0x01) != 0; } bool DoDeblock() const { return DoDeblock(frame_header_, do_post_filter_mask_); @@ -178,20 +180,21 @@ class PostFilter { const int8_t delta_lf[kFrameLfCount], uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount] [kNumReferenceFrameTypes][2]) const; - bool DoRestoration() const { - return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_); - } // Returns true if loop restoration will be performed for the given parameters // and mask. static bool DoRestoration(const LoopRestoration& loop_restoration, uint8_t do_post_filter_mask, int num_planes) { - if ((do_post_filter_mask & 0x08) == 0) return false; if (num_planes == kMaxPlanesMonochrome) { - return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone; + return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone && + (do_post_filter_mask & 0x08) != 0; } - return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone || - loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone || - loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone; + return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone || + loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone || + loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) && + (do_post_filter_mask & 0x08) != 0; + } + bool DoRestoration() const { + return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_); } // Returns a pointer to the unfiltered buffer. This is used by the Tile class @@ -204,13 +207,12 @@ class PostFilter { // mask. static bool DoSuperRes(const ObuFrameHeader& frame_header, uint8_t do_post_filter_mask) { - return (do_post_filter_mask & 0x04) != 0 && - frame_header.width != frame_header.upscaled_width; + return frame_header.width != frame_header.upscaled_width && + (do_post_filter_mask & 0x04) != 0; } bool DoSuperRes() const { return DoSuperRes(frame_header_, do_post_filter_mask_); } - LoopFilterMask* masks() const { return masks_; } LoopRestorationInfo* restoration_info() const { return restoration_info_; } uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane, int row4x4, int column4x4) const { @@ -249,37 +251,23 @@ class PostFilter { // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member // functions. using DeblockFilter = void (PostFilter::*)(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); - // The lookup table for picking the deblock filter, according to: - // kDeblockFilterBitMask (first dimension), and deblock filter type (second). - const DeblockFilter deblock_filter_type_table_[2][2] = { - {&PostFilter::VerticalDeblockFilterNoMask, - &PostFilter::HorizontalDeblockFilterNoMask}, - {&PostFilter::VerticalDeblockFilter, - &PostFilter::HorizontalDeblockFilter}, - }; - // Buffers for loop restoration intermediate results. Depending on the filter - // type, only one member of the union is used. - union IntermediateBuffers { - // For Wiener filter. - // The array |intermediate| in Section 7.17.4, the intermediate results - // between the horizontal and vertical filters. 
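[Editor's note] The Do*() helpers above each gate on one bit of |do_post_filter_mask| in addition to the frame header: 0x01 for deblocking, 0x02 for CDEF, 0x04 for SuperRes and 0x08 for loop restoration. A minimal sketch with hypothetical enumerator names (only the values come from the checks above):

    #include <cstdint>

    // Hypothetical names for the bits tested by DoDeblock()/DoCdef()/
    // DoSuperRes()/DoRestoration().
    enum PostFilterMaskBit : uint8_t {
      kApplyDeblockFilter = 0x01,
      kApplyCdef = 0x02,
      kApplySuperRes = 0x04,
      kApplyLoopRestoration = 0x08,
    };
    constexpr uint8_t kApplyAllPostFilters = 0x0f;  // Normal decoding path.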
- alignas(kMaxAlignment) - uint16_t wiener[(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1) * - kMaxSuperBlockSizeInPixels]; - // For self-guided filter. - struct { - // The arrays flt0 and flt1 in Section 7.17.2, the outputs of the box - // filter process in pass 0 and pass 1. - alignas( - kMaxAlignment) int32_t output[2][kMaxBoxFilterProcessOutputPixels]; - // The 2d arrays A and B in Section 7.17.3, the intermediate results in - // the box filter process. Reused for pass 0 and pass 1. - alignas(kMaxAlignment) uint32_t - intermediate_a[kBoxFilterProcessIntermediatePixels]; - alignas(kMaxAlignment) uint32_t - intermediate_b[kBoxFilterProcessIntermediatePixels]; - } box_filter; + int column4x4_start); + // The lookup table for picking the deblock filter, according to deblock + // filter type. + const DeblockFilter deblock_filter_func_[2] = { + &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter}; + + // The type of GetVerticalDeblockFilterEdgeInfo* member functions. + using DeblockVerticalEdgeInfo = bool (PostFilter::*)( + const Plane plane, int row4x4, int column4x4, const int8_t subsampling_x, + const int8_t subsampling_y, BlockParameters* const* bp_ptr, + uint8_t* level, int* step, int* filter_length) const; + // The lookup table for picking the GetVerticalDeblockEdgeInfo based on the + // plane. + const DeblockVerticalEdgeInfo deblock_vertical_edge_info_[kMaxPlanes] = { + &PostFilter::GetVerticalDeblockFilterEdgeInfo, + &PostFilter::GetVerticalDeblockFilterEdgeInfoUV, + &PostFilter::GetVerticalDeblockFilterEdgeInfoUV, }; // Functions common to all post filters. @@ -337,35 +325,26 @@ class PostFilter { int GetDeblockUnitId(int row_unit, int column_unit) const { return row_unit * num_64x64_blocks_per_row_ + column_unit; } - static dsp::LoopFilterSize GetLoopFilterSize(Plane plane, int step) { - if (step == 4) { - return dsp::kLoopFilterSize4; - } - if (step == 8) { - return (plane == kPlaneY) ? dsp::kLoopFilterSize8 : dsp::kLoopFilterSize6; - } - return (plane == kPlaneY) ? dsp::kLoopFilterSize14 : dsp::kLoopFilterSize6; - } - void InitDeblockFilterParams(); // Part of 7.14.4. - void GetDeblockFilterParams(uint8_t level, int* outer_thresh, - int* inner_thresh, int* hev_thresh) const; - template <LoopFilterType type> - bool GetDeblockFilterEdgeInfo(Plane plane, int row4x4, int column4x4, - int8_t subsampling_x, int8_t subsampling_y, - uint8_t* level, int* step, - int* filter_length) const; + bool GetHorizontalDeblockFilterEdgeInfo(Plane plane, int row4x4, + int column4x4, int8_t subsampling_x, + int8_t subsampling_y, uint8_t* level, + int* step, int* filter_length) const; + bool GetVerticalDeblockFilterEdgeInfo(Plane plane, int row4x4, int column4x4, + int8_t subsampling_x, + int8_t subsampling_y, + BlockParameters* const* bp_ptr, + uint8_t* level, int* step, + int* filter_length) const; + bool GetVerticalDeblockFilterEdgeInfoUV(Plane plane, int row4x4, + int column4x4, int8_t subsampling_x, + int8_t subsampling_y, + BlockParameters* const* bp_ptr, + uint8_t* level, int* step, + int* filter_length) const; void HorizontalDeblockFilter(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); - void VerticalDeblockFilter(Plane plane, int row4x4_start, int column4x4_start, - int unit_id); - // |unit_id| is not used, keep it to match the same interface as - // HorizontalDeblockFilter(). 
- void HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); - // |unit_id| is not used, keep it to match the same interface as - // VerticalDeblockFilter(). - void VerticalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); + int column4x4_start); + void VerticalDeblockFilter(Plane plane, int row4x4_start, + int column4x4_start); // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct // signature. static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter), @@ -385,7 +364,6 @@ class PostFilter { // Functions for the cdef filter. uint8_t* GetCdefBufferAndStride(int start_x, int start_y, int plane, - int subsampling_x, int subsampling_y, int window_buffer_plane_size, int* cdef_stride) const; // This function prepares the input source block for cdef filtering. The input @@ -394,9 +372,9 @@ class PostFilter { // pixels with a large value. This achieves the required behavior defined in // section 5.11.52 of the spec. template <typename Pixel> - void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row_64x64, - int column_64x64, uint16_t* cdef_source, - ptrdiff_t cdef_stride); + void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4, + int column4x4, uint16_t* cdef_source, + ptrdiff_t cdef_stride, bool y_plane); template <typename Pixel> void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4, int block_height4x4, int row4x4_start, @@ -434,12 +412,14 @@ class PostFilter { // Functions for the Loop Restoration filter. template <typename Pixel> - void ApplyLoopRestorationForOneUnit( - uint8_t* cdef_buffer, ptrdiff_t cdef_buffer_stride, Plane plane, - int plane_height, int x, int y, int row, int column, int unit_row, - int current_process_unit_height, int plane_process_unit_width, - int plane_unit_size, int num_horizontal_units, int plane_width, - Array2DView<Pixel>* loop_restored_window); + void ApplyLoopRestorationForOneUnit(uint8_t* cdef_buffer, + ptrdiff_t cdef_buffer_stride, Plane plane, + int plane_height, int x, int y, int row, + int column, int unit_row, + int current_process_unit_height, + int plane_unit_size, + int num_horizontal_units, int plane_width, + Array2DView<Pixel>* loop_restored_window); template <typename Pixel> void ApplyLoopRestorationForSuperBlock(Plane plane, int x, int y, int unit_row, @@ -454,8 +434,8 @@ class PostFilter { void ApplyLoopRestorationForOneRowInWindow( uint8_t* cdef_buffer, ptrdiff_t cdef_buffer_stride, Plane plane, int plane_height, int plane_width, int x, int y, int row, int unit_row, - int current_process_unit_height, int process_unit_width, int window_width, - int plane_unit_size, int num_horizontal_units); + int current_process_unit_height, int plane_unit_size, int window_width, + int num_horizontal_units); // Note for ApplyLoopRestoration(): // First, we must differentiate loop restoration processing unit from loop // restoration unit. @@ -501,12 +481,8 @@ class PostFilter { const int8_t subsampling_y_[kMaxPlanes]; const int8_t planes_; const int pixel_size_; - // This class does not take ownership of the masks/restoration_info, but it - // could change their values. 
- LoopFilterMask* const masks_; - uint8_t inner_thresh_[kMaxLoopFilterValue + 1] = {}; - uint8_t outer_thresh_[kMaxLoopFilterValue + 1] = {}; - uint8_t hev_thresh_[kMaxLoopFilterValue + 1] = {}; + const uint8_t* const inner_thresh_; + const uint8_t* const outer_thresh_; // This stores the deblocking filter levels assuming that the delta is zero. // This will be used by all superblocks whose delta is zero (without having to // recompute them). The dimensions (in order) are: segment_id, level_index @@ -529,8 +505,6 @@ class PostFilter { // nullptr as well. uint8_t* const threaded_window_buffer_; LoopRestorationInfo* const restoration_info_; - const int window_buffer_width_; - const int window_buffer_height_; // Pointer to the line buffer used by ApplySuperRes(). If SuperRes is on, then // the buffer will be large enough to hold one downscaled row + // kSuperResHorizontalBorder. @@ -560,8 +534,10 @@ class PostFilter { // This buffer is used only when both Cdef and Loop Restoration are on. YuvBuffer& deblock_buffer_; const uint8_t do_post_filter_mask_; - ThreadPool* const thread_pool_; + const int window_buffer_width_; + const int window_buffer_height_; + // Tracks the progress of the post filters. int progress_row_ = -1; @@ -571,13 +547,11 @@ class PostFilter { // Wiener filter needs extended border of three pixels. // Therefore the size of the buffer is 70x70 pixels. alignas(alignof(uint16_t)) uint8_t - block_buffer_[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * - sizeof(uint16_t)]; + block_buffer_[kRestorationUnitHeightWithBorders * + kRestorationUnitWidthWithBorders * sizeof(uint16_t)]; // A block buffer to hold the input that is converted to uint16_t before // cdef filtering. Only used in single threaded case. - uint16_t cdef_block_[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * 3]; + uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3]; template <int bitdepth, typename Pixel> friend class PostFilterSuperResTest; @@ -586,75 +560,69 @@ class PostFilter { friend class PostFilterHelperFuncTest; }; +template <typename Pixel> +void CopyTwoRows(const Pixel* src, const ptrdiff_t src_stride, Pixel** dst, + const ptrdiff_t dst_stride, const int width) { + for (int i = 0; i < kRestorationBorder - 1; ++i) { + memcpy(*dst, src, sizeof(Pixel) * width); + src += src_stride; + *dst += dst_stride; + } +} + // This function takes the cdef filtered buffer and the deblocked buffer to // prepare a block as input for loop restoration. // In striped loop restoration: -// The filtering needs to fetch the area of size (width + 6) x (height + 6), -// in which (width + 6) x height area is from cdef filtered frame -// (cdef_buffer). Top 3 rows and bottom 3 rows are from deblocked frame -// (deblock_buffer). +// The filtering needs to fetch the area of size (width + 6) x (height + 4), +// in which (width + 6) x height area is from cdef filtered frame (cdef_buffer). +// Top 2 rows and bottom 2 rows are from deblocked frame (deblock_buffer). // Special cases are: -// (1). when it is the top border, the top 3 rows are from cdef -// filtered frame. -// (2). when it is the bottom border, the bottom 3 rows are from cdef -// filtered frame. -// For the top 3 rows and bottom 3 rows, the top_row[0] is a copy of the -// top_row[1]. The bottom_row[2] is a copy of the bottom_row[1]. If cdef is -// not applied for this frame, cdef_buffer is the same as deblock_buffer. +// (1). 
when it is the top border, the top 2 rows are from cdef filtered frame. +// (2). when it is the bottom border, the bottom 2 rows are from cdef filtered +// frame. +// This function is called only when cdef is applied for this frame. template <typename Pixel> -void PrepareLoopRestorationBlock(const bool do_cdef, const uint8_t* cdef_buffer, +void PrepareLoopRestorationBlock(const uint8_t* cdef_buffer, ptrdiff_t cdef_stride, const uint8_t* deblock_buffer, ptrdiff_t deblock_stride, uint8_t* dest, ptrdiff_t dest_stride, const int width, const int height, const bool frame_top_border, const bool frame_bottom_border) { - const auto* cdef_ptr = reinterpret_cast<const Pixel*>(cdef_buffer); cdef_stride /= sizeof(Pixel); - const auto* deblock_ptr = reinterpret_cast<const Pixel*>(deblock_buffer); deblock_stride /= sizeof(Pixel); - auto* dst = reinterpret_cast<Pixel*>(dest); dest_stride /= sizeof(Pixel); - // Top 3 rows. - cdef_ptr -= (kRestorationBorder - 1) * cdef_stride + kRestorationBorder; - if (deblock_ptr != nullptr) deblock_ptr -= kRestorationBorder; - for (int i = 0; i < kRestorationBorder; ++i) { - if (frame_top_border || !do_cdef) { - memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } else { - memcpy(dst, deblock_ptr, - sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } - if (i > 0) { - if (deblock_ptr != nullptr) deblock_ptr += deblock_stride; - cdef_ptr += cdef_stride; - } - dst += dest_stride; + const auto* cdef_ptr = reinterpret_cast<const Pixel*>(cdef_buffer) - + (kRestorationBorder - 1) * cdef_stride - + kRestorationBorder; + const auto* deblock_ptr = + reinterpret_cast<const Pixel*>(deblock_buffer) - kRestorationBorder; + auto* dst = reinterpret_cast<Pixel*>(dest) + dest_stride; + int h = height; + // Top 2 rows. + if (frame_top_border) { + h += kRestorationBorder - 1; + } else { + CopyTwoRows<Pixel>(deblock_ptr, deblock_stride, &dst, dest_stride, + width + 2 * kRestorationBorder); + cdef_ptr += (kRestorationBorder - 1) * cdef_stride; + // If |frame_top_border| is true, then we are in the first superblock row, + // so in that case, do not increment |deblock_ptr| since we don't store + // anything from the first superblock row into |deblock_buffer|. + deblock_ptr += 4 * deblock_stride; } + if (frame_bottom_border) h += kRestorationBorder - 1; // Main body. - for (int i = 0; i < height; ++i) { + do { memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder)); cdef_ptr += cdef_stride; dst += dest_stride; - } - // Bottom 3 rows. If |frame_top_border| is true, then we are in the first - // superblock row, so in that case, do not increment |deblock_ptr| since we - // don't store anything from the first superblock row into |deblock_buffer|. - if (deblock_ptr != nullptr && !frame_top_border) { - deblock_ptr += deblock_stride * 4; - } - for (int i = 0; i < kRestorationBorder; ++i) { - if (frame_bottom_border || !do_cdef) { - memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } else { - memcpy(dst, deblock_ptr, - sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } - if (i < kRestorationBorder - 2) { - if (deblock_ptr != nullptr) deblock_ptr += deblock_stride; - cdef_ptr += cdef_stride; - } - dst += dest_stride; + } while (--h != 0); + // Bottom 2 rows. 
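[Editor's note] To summarize the copying PrepareLoopRestorationBlock() performs (an editor's paraphrase of the code, assuming CDEF is enabled and the unit touches neither the frame top nor bottom): starting one row into |dest| (dst = dest + dest_stride), height + 4 rows of width + 2 * kRestorationBorder pixels are written:

    rows 0..1                 copied from deblock_buffer (CopyTwoRows)
    rows 2..height + 1        copied from cdef_buffer
    rows height + 2..height + 3  copied from deblock_buffer (CopyTwoRows)

At the frame top or bottom the corresponding two border rows come from cdef_buffer instead, which the code handles by extending the main-body copy (h += kRestorationBorder - 1).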
+ if (!frame_bottom_border) { + deblock_ptr += (kRestorationBorder - 1) * deblock_stride; + CopyTwoRows<Pixel>(deblock_ptr, deblock_stride, &dst, dest_stride, + width + 2 * kRestorationBorder); } } diff --git a/chromium/third_party/libgav1/src/src/post_filter/cdef.cc b/chromium/third_party/libgav1/src/src/post_filter/cdef.cc index c169acd6532..2b3b7119f0b 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/cdef.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/cdef.cc @@ -20,6 +20,7 @@ namespace libgav1 { namespace { constexpr int kStep64x64 = 16; // =64/4. +constexpr int kCdefSkip = 8; constexpr uint8_t kCdefUvDirection[2][2][8] = { {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}}, @@ -57,19 +58,31 @@ void CopyRowForCdef(const Pixel* src, int block_width, int unit_width, } } +// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to +// |dst|. +void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width, int height, size_t pixel_size) { + int y = height; + do { + memcpy(dst, src, width * pixel_size); + src += src_stride; + dst += dst_stride; + } while (--y != 0); +} + } // namespace uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x, const int start_y, const int plane, - const int subsampling_x, - const int subsampling_y, const int window_buffer_plane_size, int* cdef_stride) const { if (thread_pool_ != nullptr) { // write output to threaded_window_buffer. *cdef_stride = window_buffer_width_ * pixel_size_; - const int column_window = start_x % (window_buffer_width_ >> subsampling_x); - const int row_window = start_y % (window_buffer_height_ >> subsampling_y); + const int column_window = + start_x % (window_buffer_width_ >> subsampling_x_[plane]); + const int row_window = + start_y % (window_buffer_height_ >> subsampling_y_[plane]); return threaded_window_buffer_ + plane * window_buffer_plane_size + row_window * (*cdef_stride) + column_window * pixel_size_; } @@ -80,72 +93,82 @@ uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x, template <typename Pixel> void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4, - int row_64x64, int column_64x64, - uint16_t* cdef_source, - ptrdiff_t cdef_stride) { - for (int plane = kPlaneY; plane < planes_; ++plane) { - uint16_t* cdef_src = - cdef_source + plane * kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders; - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int start_x = MultiplyBy4(column_64x64) >> subsampling_x; - const int start_y = MultiplyBy4(row_64x64) >> subsampling_y; - const int plane_width = RightShiftWithRounding(width_, subsampling_x); - const int plane_height = RightShiftWithRounding(height_, subsampling_y); - const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x; - const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y; - // unit_width, unit_height are the same as block_width, block_height unless - // it reaches the frame boundary, where block_width < 64 or - // block_height < 64. unit_width, unit_height guarantee we build blocks on - // a multiple of 8. - const int unit_width = Align(block_width, (subsampling_x > 0) ? 4 : 8); - const int unit_height = Align(block_height, (subsampling_y > 0) ? 
4 : 8); - const bool is_frame_left = column_64x64 == 0; - const bool is_frame_right = start_x + block_width >= plane_width; - const bool is_frame_top = row_64x64 == 0; - const bool is_frame_bottom = start_y + block_height >= plane_height; + int row4x4, int column4x4, + uint16_t* cdef_source, ptrdiff_t cdef_stride, + const bool y_plane) { + assert(y_plane || planes_ == kMaxPlanes); + const int max_planes = y_plane ? 1 : kMaxPlanes; + const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU]; + const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU]; + const int start_x = MultiplyBy4(column4x4) >> subsampling_x; + const int start_y = MultiplyBy4(row4x4) >> subsampling_y; + const int plane_width = RightShiftWithRounding(width_, subsampling_x); + const int plane_height = RightShiftWithRounding(height_, subsampling_y); + const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x; + const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y; + // unit_width, unit_height are the same as block_width, block_height unless + // it reaches the frame boundary, where block_width < 64 or + // block_height < 64. unit_width, unit_height guarantee we build blocks on + // a multiple of 8. + const int unit_width = Align(block_width, 8 >> subsampling_x); + const int unit_height = Align(block_height, 8 >> subsampling_y); + const bool is_frame_left = column4x4 == 0; + const bool is_frame_right = start_x + block_width >= plane_width; + const bool is_frame_top = row4x4 == 0; + const bool is_frame_bottom = start_y + block_height >= plane_height; + const int y_offset = is_frame_top ? 0 : kCdefBorder; + + for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) { + uint16_t* cdef_src = cdef_source + plane * kCdefUnitSizeWithBorders * + kCdefUnitSizeWithBorders; const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel); const Pixel* src_buffer = reinterpret_cast<const Pixel*>(source_buffer_[plane]) + - (start_y - (is_frame_top ? 0 : kCdefBorder)) * src_stride + start_x; + (start_y - y_offset) * src_stride + start_x; // All the copying code will use negative indices for populating the left // border. So the starting point is set to kCdefBorder. cdef_src += kCdefBorder; // Copy the top 2 rows. - for (int y = 0; y < kCdefBorder; ++y) { - if (is_frame_top) { + if (is_frame_top) { + for (int y = 0; y < kCdefBorder; ++y) { Memset(cdef_src - kCdefBorder, kCdefLargeValue, unit_width + 2 * kCdefBorder); - } else { + cdef_src += cdef_stride; + } + } else { + for (int y = 0; y < kCdefBorder; ++y) { CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left, is_frame_right, cdef_src); src_buffer += src_stride; + cdef_src += cdef_stride; } - cdef_src += cdef_stride; } // Copy the body. - for (int y = 0; y < block_height; ++y) { + int y = block_height; + do { CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left, is_frame_right, cdef_src); cdef_src += cdef_stride; src_buffer += src_stride; - } + } while (--y != 0); // Copy the bottom 2 rows. 
- for (int y = 0; y < kCdefBorder + unit_height - block_height; ++y) { - if (is_frame_bottom) { + if (is_frame_bottom) { + do { Memset(cdef_src - kCdefBorder, kCdefLargeValue, unit_width + 2 * kCdefBorder); - } else { + cdef_src += cdef_stride; + } while (++y < kCdefBorder + unit_height - block_height); + } else { + do { CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left, is_frame_right, cdef_src); src_buffer += src_stride; - } - cdef_src += cdef_stride; + cdef_src += cdef_stride; + } while (++y < kCdefBorder + unit_height - block_height); } } } @@ -156,130 +179,237 @@ void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index, const int block_height4x4, const int row4x4_start, const int column4x4_start) { - const int coeff_shift = bitdepth_ - 8; - const int step = kNum4x4BlocksWide[kBlock8x8]; + // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling). + static constexpr int kStep = 8; + static constexpr int kStep4x4 = 2; + const int window_buffer_plane_size = window_buffer_width_ * window_buffer_height_ * pixel_size_; + int cdef_buffer_row_base_stride[kMaxPlanes]; + int cdef_buffer_stride[kMaxPlanes]; + uint8_t* cdef_buffer_row_base[kMaxPlanes]; + int src_buffer_row_base_stride[kMaxPlanes]; + const uint8_t* src_buffer_row_base[kMaxPlanes]; + int column_step[kMaxPlanes]; + for (int plane = kPlaneY; plane < planes_; ++plane) { + const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane]; + const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane]; + cdef_buffer_row_base[plane] = GetCdefBufferAndStride( + start_x, start_y, plane, window_buffer_plane_size, + &cdef_buffer_stride[plane]); + cdef_buffer_row_base_stride[plane] = + cdef_buffer_stride[plane] * (kStep >> subsampling_y_[plane]); + src_buffer_row_base[plane] = source_buffer_[plane] + + start_y * frame_buffer_.stride(plane) + + start_x * pixel_size_; + src_buffer_row_base_stride[plane] = + frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]); + column_step[plane] = (kStep >> subsampling_x_[plane]) * pixel_size_; + } if (index == -1) { for (int plane = kPlaneY; plane < planes_; ++plane) { - const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane]; - const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane]; - int cdef_stride; - uint8_t* const cdef_buffer = GetCdefBufferAndStride( - start_x, start_y, plane, subsampling_x_[plane], subsampling_y_[plane], - window_buffer_plane_size, &cdef_stride); - const int src_stride = frame_buffer_.stride(plane); - uint8_t* const src_buffer = - source_buffer_[plane] + start_y * src_stride + start_x * pixel_size_; - const int block_width = - MultiplyBy4(block_width4x4) >> subsampling_x_[plane]; - const int block_height = - MultiplyBy4(block_height4x4) >> subsampling_y_[plane]; - for (int y = 0; y < block_height; ++y) { - memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride, - block_width * pixel_size_); + CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane), + cdef_buffer_row_base[plane], cdef_buffer_stride[plane], + MultiplyBy4(block_width4x4) >> subsampling_x_[plane], + MultiplyBy4(block_height4x4) >> subsampling_y_[plane], + pixel_size_); + } + return; + } + + PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start, + column4x4_start, cdef_block, kCdefUnitSizeWithBorders, + true); + + // Stored direction used during the u/v pass. If bit 3 is set, then block is + // a skip. 
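[Editor's note] The luma pass records one int per 8x8 block in |direction_y| and the chroma pass reuses it. Because kCdefSkip is 8 (bit 3) while directions occupy [0, 7], a single value carries both facts. A self-contained sketch of that encoding; the helper names are hypothetical, only the values mirror the code:

    constexpr int kCdefSkipFlag = 8;  // Bit 3; direction values stay in [0, 7].

    inline int EncodeLumaCdefState(bool skip, int direction) {
      return skip ? kCdefSkipFlag : direction;
    }
    inline bool LumaBlockWasSkipped(int state) {
      return (state & kCdefSkipFlag) != 0;
    }
    inline int LumaDirection(int state) { return state & 7; }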
+ int direction_y[8 * 8]; + int y_index = 0; + + const uint8_t y_primary_strength = + frame_header_.cdef.y_primary_strength[index]; + const uint8_t y_secondary_strength = + frame_header_.cdef.y_secondary_strength[index]; + + const bool compute_direction_and_variance = + (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0; + BlockParameters* const* bp_row0_base = + block_parameters_.Address(row4x4_start, column4x4_start); + BlockParameters* const* bp_row1_base = + bp_row0_base + block_parameters_.columns4x4(); + const int bp_stride = MultiplyBy2(block_parameters_.columns4x4()); + int row4x4 = row4x4_start; + do { + uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY]; + const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY]; + BlockParameters* const* bp0 = bp_row0_base; + BlockParameters* const* bp1 = bp_row1_base; + int column4x4 = column4x4_start; + do { + const int block_width = kStep; + const int block_height = kStep; + const int cdef_stride = cdef_buffer_stride[kPlaneY]; + uint8_t* const cdef_buffer = cdef_buffer_base; + const int src_stride = frame_buffer_.stride(kPlaneY); + const uint8_t* const src_buffer = src_buffer_base; + + const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip && + (*(bp1 + 1))->skip; + + if (skip) { // No cdef filtering. + direction_y[y_index] = kCdefSkip; + CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride, + block_width, block_height, pixel_size_); + } else { + // Zero out residual skip flag. + direction_y[y_index] = 0; + + int variance = 0; + if (compute_direction_and_variance) { + dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index], + &variance); + } + const int direction = + (y_primary_strength == 0) ? 0 : direction_y[y_index]; + const int variance_strength = + ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0; + const uint8_t primary_strength = + (variance != 0) + ? 
(y_primary_strength * (4 + variance_strength) + 8) >> 4 + : 0; + + if ((primary_strength | y_secondary_strength) == 0) { + CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride, + block_width, block_height, pixel_size_); + } else { + uint16_t* cdef_src = cdef_block + kPlaneY * kCdefUnitSizeWithBorders * + kCdefUnitSizeWithBorders; + cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder; + cdef_src += + (MultiplyBy4(row4x4 - row4x4_start)) * kCdefUnitSizeWithBorders + + (MultiplyBy4(column4x4 - column4x4_start)); + dsp_.cdef_filter(cdef_src, kCdefUnitSizeWithBorders, block_width, + block_height, primary_strength, y_secondary_strength, + frame_header_.cdef.damping, direction, cdef_buffer, + cdef_stride); + } } + cdef_buffer_base += column_step[kPlaneY]; + src_buffer_base += column_step[kPlaneY]; + + bp0 += kStep4x4; + bp1 += kStep4x4; + column4x4 += kStep4x4; + y_index++; + } while (column4x4 < column4x4_start + block_width4x4); + + cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY]; + src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY]; + bp_row0_base += bp_stride; + bp_row1_base += bp_stride; + row4x4 += kStep4x4; + } while (row4x4 < row4x4_start + block_height4x4); + + if (planes_ == kMaxPlanesMonochrome) { + return; + } + + const uint8_t uv_primary_strength = + frame_header_.cdef.uv_primary_strength[index]; + const uint8_t uv_secondary_strength = + frame_header_.cdef.uv_secondary_strength[index]; + + if ((uv_primary_strength | uv_secondary_strength) == 0) { + for (int plane = kPlaneU; plane <= kPlaneV; ++plane) { + CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane), + cdef_buffer_row_base[plane], cdef_buffer_stride[plane], + MultiplyBy4(block_width4x4) >> subsampling_x_[plane], + MultiplyBy4(block_height4x4) >> subsampling_y_[plane], + pixel_size_); } return; } PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start, - column4x4_start, cdef_block, - kRestorationProcessingUnitSizeWithBorders); - - for (int row4x4 = row4x4_start; row4x4 < row4x4_start + block_height4x4; - row4x4 += step) { - for (int column4x4 = column4x4_start; - column4x4 < column4x4_start + block_width4x4; column4x4 += step) { - const bool skip = - block_parameters_.Find(row4x4, column4x4) != nullptr && - block_parameters_.Find(row4x4 + 1, column4x4) != nullptr && - block_parameters_.Find(row4x4, column4x4 + 1) != nullptr && - block_parameters_.Find(row4x4 + 1, column4x4 + 1) != nullptr && - block_parameters_.Find(row4x4, column4x4)->skip && - block_parameters_.Find(row4x4 + 1, column4x4)->skip && - block_parameters_.Find(row4x4, column4x4 + 1)->skip && - block_parameters_.Find(row4x4 + 1, column4x4 + 1)->skip; - int damping = frame_header_.cdef.damping + coeff_shift; - int direction_y; - int direction; - int variance; - uint8_t primary_strength; - uint8_t secondary_strength; + column4x4_start, cdef_block, kCdefUnitSizeWithBorders, + false); - for (int plane = kPlaneY; plane < planes_; ++plane) { - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int start_x = MultiplyBy4(column4x4) >> subsampling_x; - const int start_y = MultiplyBy4(row4x4) >> subsampling_y; - const int block_width = 8 >> subsampling_x; - const int block_height = 8 >> subsampling_y; - int cdef_stride; - uint8_t* const cdef_buffer = GetCdefBufferAndStride( - start_x, start_y, plane, subsampling_x, subsampling_y, - window_buffer_plane_size, &cdef_stride); + for (int plane = kPlaneU; plane <= kPlaneV; ++plane) 
{ + const int8_t subsampling_x = subsampling_x_[plane]; + const int8_t subsampling_y = subsampling_y_[plane]; + const int block_width = kStep >> subsampling_x; + const int block_height = kStep >> subsampling_y; + int row4x4 = row4x4_start; + + y_index = 0; + do { + uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane]; + const uint8_t* src_buffer_base = src_buffer_row_base[plane]; + int column4x4 = column4x4_start; + do { + const int cdef_stride = cdef_buffer_stride[plane]; + uint8_t* const cdef_buffer = cdef_buffer_base; const int src_stride = frame_buffer_.stride(plane); - uint8_t* const src_buffer = source_buffer_[plane] + - start_y * src_stride + - start_x * pixel_size_; + const uint8_t* const src_buffer = src_buffer_base; + const bool skip = direction_y[y_index] & kCdefSkip; + int dual_cdef = 0; if (skip) { // No cdef filtering. - for (int y = 0; y < block_height; ++y) { - memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride, - block_width * pixel_size_); + CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride, + block_width, block_height, pixel_size_); + } else { + // Make sure block pair is not out of bounds. + if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) { + // Enable dual processing if subsampling_x is 1. + dual_cdef = subsampling_x; } - continue; - } - if (plane == kPlaneY) { - dsp_.cdef_direction(src_buffer, src_stride, &direction_y, &variance); - primary_strength = frame_header_.cdef.y_primary_strength[index] - << coeff_shift; - secondary_strength = frame_header_.cdef.y_secondary_strength[index] - << coeff_shift; - direction = (primary_strength == 0) ? 0 : direction_y; - const int variance_strength = - ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) - : 0; - primary_strength = - (variance != 0) - ? (primary_strength * (4 + variance_strength) + 8) >> 4 - : 0; - } else { - primary_strength = frame_header_.cdef.uv_primary_strength[index] - << coeff_shift; - secondary_strength = frame_header_.cdef.uv_secondary_strength[index] - << coeff_shift; - direction = - (primary_strength == 0) - ? 0 - : kCdefUvDirection[subsampling_x][subsampling_y][direction_y]; - damping = frame_header_.cdef.damping + coeff_shift - 1; - } + int direction = (uv_primary_strength == 0) + ? 0 + : kCdefUvDirection[subsampling_x][subsampling_y] + [direction_y[y_index]]; + + if (dual_cdef != 0) { + if (uv_primary_strength && + direction_y[y_index] != direction_y[y_index + 1]) { + // Disable dual processing if the second block of the pair does + // not have the same direction. + dual_cdef = 0; + } - if ((primary_strength | secondary_strength) == 0) { - for (int y = 0; y < block_height; ++y) { - memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride, - block_width * pixel_size_); + // Disable dual processing if the second block of the pair is a + // skip. 
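[Editor's note] The |dual_cdef| path lets one cdef_filter() call cover two horizontally adjacent chroma blocks when subsampling_x is 1, but only when the pair is in bounds, the two luma blocks share a direction (or the primary strength is zero), and the second block is not a skip. A condensed restatement of that predicate as an editor's sketch using the surrounding variable names, not a drop-in replacement:

    // |in_bounds| corresponds to
    // column4x4 + 2 * kStep4x4 <= column4x4_start + block_width4x4.
    inline bool CanUseDualCdef(int subsampling_x, bool in_bounds,
                               int uv_primary_strength, int direction_first,
                               int direction_second, int cdef_skip_flag) {
      return subsampling_x == 1 && in_bounds &&
             (uv_primary_strength == 0 || direction_first == direction_second) &&
             direction_second != cdef_skip_flag;
    }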
+ if (direction_y[y_index + 1] == kCdefSkip) { + dual_cdef = 0; + } } - continue; + + uint16_t* cdef_src = cdef_block + plane * kCdefUnitSizeWithBorders * + kCdefUnitSizeWithBorders; + cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder; + cdef_src += + (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) * + kCdefUnitSizeWithBorders + + (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x); + dsp_.cdef_filter(cdef_src, kCdefUnitSizeWithBorders, + block_width << dual_cdef, block_height, + uv_primary_strength, uv_secondary_strength, + frame_header_.cdef.damping - 1, direction, + cdef_buffer, cdef_stride); } - uint16_t* cdef_src = - cdef_block + plane * kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders; - cdef_src += kCdefBorder * kRestorationProcessingUnitSizeWithBorders + - kCdefBorder; - cdef_src += (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) * - kRestorationProcessingUnitSizeWithBorders + - (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x); - dsp_.cdef_filter(cdef_src, kRestorationProcessingUnitSizeWithBorders, - frame_header_.rows4x4, frame_header_.columns4x4, - start_x, start_y, subsampling_x, subsampling_y, - primary_strength, secondary_strength, damping, - direction, cdef_buffer, cdef_stride); - } - } + // When dual_cdef is set, the above cdef_filter() will process 2 blocks, + // so adjust the pointers and indexes for 2 blocks. + cdef_buffer_base += column_step[plane] << dual_cdef; + src_buffer_base += column_step[plane] << dual_cdef; + column4x4 += kStep4x4 << dual_cdef; + y_index += 1 << dual_cdef; + } while (column4x4 < column4x4_start + block_width4x4); + + cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane]; + src_buffer_row_base[plane] += src_buffer_row_base_stride[plane]; + row4x4 += kStep4x4; + } while (row4x4 < row4x4_start + block_height4x4); } } @@ -336,8 +466,7 @@ void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4, template <typename Pixel> void PostFilter::ApplyCdefForOneRowInWindow(const int row4x4, const int column4x4_start) { - uint16_t cdef_block[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * 3]; + uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3]; for (int column4x4_64x64 = 0; column4x4_64x64 < std::min(DivideBy4(window_buffer_width_), diff --git a/chromium/third_party/libgav1/src/src/post_filter/deblock.cc b/chromium/third_party/libgav1/src/src/post_filter/deblock.cc index db21d3db117..afe2895dbe3 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/deblock.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/deblock.cc @@ -17,6 +17,76 @@ #include "src/utils/blocking_counter.h" namespace libgav1 { +namespace { + +constexpr uint8_t HevThresh(int level) { return DivideBy16(level); } + +// GetLoopFilterSize* functions depend on this exact ordering of the +// LoopFilterSize enums. +static_assert(dsp::kLoopFilterSize4 == 0, ""); +static_assert(dsp::kLoopFilterSize6 == 1, ""); +static_assert(dsp::kLoopFilterSize8 == 2, ""); +static_assert(dsp::kLoopFilterSize14 == 3, ""); + +dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) { + // |filter_length| must be a power of 2. 
+ assert((filter_length & (filter_length - 1)) == 0); + // This code is the branch free equivalent of: + // if (filter_length == 4) return kLoopFilterSize4; + // if (filter_length == 8) return kLoopFilterSize8; + // return kLoopFilterSize14; + return static_cast<dsp::LoopFilterSize>( + MultiplyBy2(static_cast<int>(filter_length > 4)) + + static_cast<int>(filter_length > 8)); +} + +constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) { + // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4, + // otherwise size is kLoopFilterSize6. + return static_cast<dsp::LoopFilterSize>(filter_length != 4); +} + +// 7.14.5. +void ComputeDeblockFilterLevelsHelper( + const ObuFrameHeader& frame_header, int segment_id, int level_index, + const int8_t delta_lf[kFrameLfCount], + uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) { + const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0]; + uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0, + kMaxLoopFilterValue); + const auto feature = static_cast<SegmentFeature>( + kSegmentFeatureLoopFilterYVertical + level_index); + level = + Clip3(level + frame_header.segmentation.feature_data[segment_id][feature], + 0, kMaxLoopFilterValue); + if (!frame_header.loop_filter.delta_enabled) { + static_assert(sizeof(deblock_filter_levels[0][0]) == 1, ""); + memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2); + return; + } + assert(frame_header.loop_filter.delta_enabled); + const int shift = level >> 5; + deblock_filter_levels[kReferenceFrameIntra][0] = Clip3( + level + + LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra], + shift), + 0, kMaxLoopFilterValue); + // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does + // not have to be populated. 
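[Editor's note] The loop below folds the per-reference-frame and per-mode deltas into the base level. A worked sketch under assumed inputs (base level 32, ref_delta -1, mode_delta 0), following the Clip3 formula in the loop; the function name is hypothetical and the left shift is written as a multiply to stay well defined for negative deltas:

    #include <algorithm>
    #include <cstdint>

    constexpr int kMaxLoopFilterValueSketch = 63;  // Matches kMaxLoopFilterValue.

    inline uint8_t DeblockLevelSketch(int level, int ref_delta, int mode_delta) {
      const int shift = level >> 5;                               // 1 for level 32.
      const int adjusted = level + (ref_delta + mode_delta) * (1 << shift);  // 30.
      return static_cast<uint8_t>(
          std::min(std::max(adjusted, 0), kMaxLoopFilterValueSketch));
    }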
+ for (int reference_frame = kReferenceFrameIntra + 1; + reference_frame < kNumReferenceFrameTypes; ++reference_frame) { + for (int mode_id = 0; mode_id < 2; ++mode_id) { + deblock_filter_levels[reference_frame][mode_id] = Clip3( + level + + LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] + + frame_header.loop_filter.mode_deltas[mode_id], + shift), + 0, kMaxLoopFilterValue); + } + } +} + +} // namespace void PostFilter::ComputeDeblockFilterLevels( const int8_t delta_lf[kFrameLfCount], @@ -28,13 +98,13 @@ void PostFilter::ComputeDeblockFilterLevels( ++segment_id) { int level_index = 0; for (; level_index < 2; ++level_index) { - LoopFilterMask::ComputeDeblockFilterLevels( + ComputeDeblockFilterLevelsHelper( frame_header_, segment_id, level_index, delta_lf, deblock_filter_levels[segment_id][level_index]); } for (; level_index < kFrameLfCount; ++level_index) { if (frame_header_.loop_filter.level[level_index] != 0) { - LoopFilterMask::ComputeDeblockFilterLevels( + ComputeDeblockFilterLevelsHelper( frame_header_, segment_id, level_index, delta_lf, deblock_filter_levels[segment_id][level_index]); } @@ -42,62 +112,28 @@ void PostFilter::ComputeDeblockFilterLevels( } } -void PostFilter::InitDeblockFilterParams() { - const int8_t sharpness = frame_header_.loop_filter.sharpness; - assert(0 <= sharpness && sharpness < 8); - const int shift = DivideBy4(sharpness + 3); // ceil(sharpness / 4.0) - for (int level = 0; level <= kMaxLoopFilterValue; ++level) { - uint8_t limit = level >> shift; - if (sharpness > 0) { - limit = Clip3(limit, 1, 9 - sharpness); - } else { - limit = std::max(limit, static_cast<uint8_t>(1)); - } - inner_thresh_[level] = limit; - outer_thresh_[level] = 2 * (level + 2) + limit; - hev_thresh_[level] = level >> 4; - } -} - -void PostFilter::GetDeblockFilterParams(uint8_t level, int* outer_thresh, - int* inner_thresh, - int* hev_thresh) const { - *outer_thresh = outer_thresh_[level]; - *inner_thresh = inner_thresh_[level]; - *hev_thresh = hev_thresh_[level]; -} - -template <LoopFilterType type> -bool PostFilter::GetDeblockFilterEdgeInfo(const Plane plane, int row4x4, - int column4x4, - const int8_t subsampling_x, - const int8_t subsampling_y, - uint8_t* level, int* step, - int* filter_length) const { +bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(const Plane plane, + int row4x4, int column4x4, + const int8_t subsampling_x, + const int8_t subsampling_y, + uint8_t* level, int* step, + int* filter_length) const { row4x4 = GetDeblockPosition(row4x4, subsampling_y); column4x4 = GetDeblockPosition(column4x4, subsampling_x); const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4); const TransformSize transform_size = (plane == kPlaneY) ? inter_transform_sizes_[row4x4][column4x4] : bp->uv_transform_size; - *step = (type == kLoopFilterTypeHorizontal) ? kTransformHeight[transform_size] - : kTransformWidth[transform_size]; - if ((type == kLoopFilterTypeHorizontal && row4x4 == subsampling_y) || - (type == kLoopFilterTypeVertical && column4x4 == subsampling_x)) { - return false; - } + *step = kTransformHeight[transform_size]; + if (row4x4 == subsampling_y) return false; - const int filter_id = kDeblockFilterLevelIndex[plane][type]; + const int filter_id = + kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal]; const uint8_t level_this = bp->deblock_filter_level[filter_id]; - const int row4x4_prev = (type == kLoopFilterTypeHorizontal) - ? row4x4 - (1 << subsampling_y) - : row4x4; - const int column4x4_prev = (type == kLoopFilterTypeHorizontal) - ? 
column4x4 - : column4x4 - (1 << subsampling_x); - assert(row4x4_prev >= 0 && column4x4_prev >= 0); + const int row4x4_prev = row4x4 - (1 << subsampling_y); + assert(row4x4_prev >= 0); const BlockParameters* bp_prev = - block_parameters_.Find(row4x4_prev, column4x4_prev); + block_parameters_.Find(row4x4_prev, column4x4); const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; *level = level_this; if (level_this == 0) { @@ -107,373 +143,91 @@ bool PostFilter::GetDeblockFilterEdgeInfo(const Plane plane, int row4x4, const BlockSize size = kPlaneResidualSize[bp->size][subsampling_x][subsampling_y]; - const int prediction_masks = (type == kLoopFilterTypeHorizontal) - ? kBlockHeightPixels[size] - 1 - : kBlockWidthPixels[size] - 1; - const int pixel_position = MultiplyBy4((type == kLoopFilterTypeHorizontal) - ? row4x4 >> subsampling_y - : column4x4 >> subsampling_x); + const int prediction_masks = kBlockHeightPixels[size] - 1; + const int pixel_position = MultiplyBy4(row4x4 >> subsampling_y); const bool is_border = (pixel_position & prediction_masks) == 0; const bool skip = bp->skip && bp->is_inter; const bool skip_prev = bp_prev->skip && bp_prev->is_inter; if (!skip || !skip_prev || is_border) { const TransformSize transform_size_prev = - (plane == kPlaneY) ? inter_transform_sizes_[row4x4_prev][column4x4_prev] + (plane == kPlaneY) ? inter_transform_sizes_[row4x4_prev][column4x4] : bp_prev->uv_transform_size; - const int step_prev = (type == kLoopFilterTypeHorizontal) - ? kTransformHeight[transform_size_prev] - : kTransformWidth[transform_size_prev]; + const int step_prev = kTransformHeight[transform_size_prev]; *filter_length = std::min(*step, step_prev); return true; } return false; } -void PostFilter::HorizontalDeblockFilter(Plane plane, int row4x4_start, - int column4x4_start, int unit_id) { - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int row_step = 1 << subsampling_y; - const int column_step = 1 << subsampling_x; - const size_t src_step = 4 * pixel_size_; - const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(plane)); - const ptrdiff_t src_stride = frame_buffer_.stride(plane); - uint8_t* src = GetSourceBuffer(plane, row4x4_start, column4x4_start); - const uint64_t single_row_mask = 0xffff; - // 3 (11), 5 (0101). - const uint64_t two_block_mask = (subsampling_x > 0) ? 5 : 3; - const LoopFilterType type = kLoopFilterTypeHorizontal; - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int column = subsampling_x; - - // AV1 smallest transform size is 4x4, thus minimum horizontal edge size is - // 4x4. For SIMD implementation, sse2 could compute 8 pixels at the same time. - // __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time. - // __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could - // filter 2 horizontal edges using sse2 and 4 edges using AVX2. - // The bitmask enables us to call different SIMD implementations to filter - // 1 edge, or 2 edges or 4 edges. - // TODO(chengchen): Here, the implementation only consider 1 and 2 edges. - // Add support for 4 edges. More branches involved, for example, if input is - // 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using - // sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then - // we apply filtering for 2 edges using sse2, and 4 edges using AVX2. 
- for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; - row4x4 += row_step) { - if (row4x4_start + row4x4 == 0) { - src += row_stride; - continue; - } - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int row = GetDeblockPosition(row4x4, subsampling_y); - const int index = GetIndex(row); - const int shift = GetShift(row, column); - const int level_offset = LoopFilterMask::GetLevelOffset(row, column); - // Mask of current row. mask4x4 represents the vertical filter length for - // the current horizontal edge is 4, and we needs to apply 3-tap filtering. - // Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16. - uint64_t mask4x4 = - (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId4x4, index) >> - shift) & - single_row_mask; - uint64_t mask8x8 = - (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId8x8, index) >> - shift) & - single_row_mask; - uint64_t mask16x16 = - (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId16x16, - index) >> - shift) & - single_row_mask; - // mask4x4, mask8x8, mask16x16 are mutually exclusive. - assert((mask4x4 & mask8x8) == 0 && (mask4x4 & mask16x16) == 0 && - (mask8x8 & mask16x16) == 0); - // Apply deblock filter for one row. - uint8_t* src_row = src; - int column_offset = 0; - for (uint64_t mask = mask4x4 | mask8x8 | mask16x16; mask != 0;) { - int edge_count = 1; - if ((mask & 1) != 0) { - // Filter parameters of current edge. - const uint8_t level = masks_->GetLevel(unit_id, plane, type, - level_offset + column_offset); - int outer_thresh_0; - int inner_thresh_0; - int hev_thresh_0; - GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0, - &hev_thresh_0); - // Filter parameters of next edge. Clip the index to avoid over - // reading at the edge of the block. The values will be unused in that - // case. - const int level_next_index = level_offset + column_offset + column_step; - const uint8_t level_next = - masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff); - int outer_thresh_1; - int inner_thresh_1; - int hev_thresh_1; - GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1, - &hev_thresh_1); - - if ((mask16x16 & 1) != 0) { - const dsp::LoopFilterSize size = (plane == kPlaneY) - ? dsp::kLoopFilterSize14 - : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask16x16 & two_block_mask) == two_block_mask) { - edge_count = 2; - // Apply filtering for two edges. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row + src_step, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else { - // Apply single edge filtering. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } - } - - if ((mask8x8 & 1) != 0) { - const dsp::LoopFilterSize size = - plane == kPlaneY ? dsp::kLoopFilterSize8 : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask8x8 & two_block_mask) == two_block_mask) { - edge_count = 2; - // Apply filtering for two edges. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row + src_step, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else { - // Apply single edge filtering. 
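[Editor's note] The new vertical edge-info functions above take a |bp_ptr| into the BlockParameters grid instead of looking blocks up by coordinates, so the block to the left is *(bp_ptr - 1) and the block one 4x4 row below is one grid stride away (as the CDEF code's bp_row1_base shows). A minimal sketch of that row-major addressing; the struct and function names here are illustrative, not the library's API:

    struct BlockParametersSketch {
      bool skip;
    };

    // |grid| holds rows4x4 * columns4x4 pointers in row-major order.
    inline const BlockParametersSketch* BlockAt(
        const BlockParametersSketch* const* grid, int columns4x4, int row4x4,
        int column4x4) {
      return grid[row4x4 * columns4x4 + column4x4];
    }
    // Left neighbor of the block at |ptr|: *(ptr - 1).
    // Block one 4x4 row below:           *(ptr + columns4x4).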
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } - } +bool PostFilter::GetVerticalDeblockFilterEdgeInfo( + const Plane /*plane*/, int row4x4, int column4x4, + const int8_t /*subsampling_x*/, const int8_t /*subsampling_y*/, + BlockParameters* const* bp_ptr, uint8_t* level, int* step, + int* filter_length) const { + const BlockParameters* bp = *bp_ptr; + *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]]; + if (column4x4 == 0) return false; - if ((mask4x4 & 1) != 0) { - const dsp::LoopFilterSize size = dsp::kLoopFilterSize4; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask4x4 & two_block_mask) == two_block_mask) { - edge_count = 2; - // Apply filtering for two edges. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row + src_step, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else { - // Apply single edge filtering. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } - } - } - - const int step = edge_count * column_step; - mask4x4 >>= step; - mask8x8 >>= step; - mask16x16 >>= step; - mask >>= step; - column_offset += step; - src_row += MultiplyBy4(edge_count) * pixel_size_; - } - src += row_stride; + const int filter_id = 0; + const uint8_t level_this = bp->deblock_filter_level[filter_id]; + const int column4x4_prev = column4x4 - 1; + assert(column4x4_prev >= 0); + const BlockParameters* bp_prev = *(bp_ptr - 1); + const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; + *level = level_this; + if (level_this == 0) { + if (level_prev == 0) return false; + *level = level_prev; } -} -void PostFilter::VerticalDeblockFilter(Plane plane, int row4x4_start, - int column4x4_start, int unit_id) { - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int row_step = 1 << subsampling_y; - const int two_row_step = row_step << 1; - const int column_step = 1 << subsampling_x; - const size_t src_step = (bitdepth_ == 8) ? 4 : 4 * sizeof(uint16_t); - const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(plane)); - const ptrdiff_t two_row_stride = row_stride << 1; - const ptrdiff_t src_stride = frame_buffer_.stride(plane); - uint8_t* src = GetSourceBuffer(plane, row4x4_start, column4x4_start); - const uint64_t single_row_mask = 0xffff; - const LoopFilterType type = kLoopFilterTypeVertical; - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int column = subsampling_x; - - // AV1 smallest transform size is 4x4, thus minimum vertical edge size is 4x4. - // For SIMD implementation, sse2 could compute 8 pixels at the same time. - // __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time. - // __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could - // filter 2 vertical edges using sse2 and 4 edges using AVX2. - // The bitmask enables us to call different SIMD implementations to filter - // 1 edge, or 2 edges or 4 edges. - // TODO(chengchen): Here, the implementation only consider 1 and 2 edges. - // Add support for 4 edges. More branches involved, for example, if input is - // 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using - // sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then - // we apply filtering for 2 edges using sse2, and 4 edges using AVX2. 
- for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; - row4x4 += two_row_step) { - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int row = GetDeblockPosition(row4x4, subsampling_y); - const int row_next = row + row_step; - const int index = GetIndex(row); - const int shift = GetShift(row, column); - const int level_offset = LoopFilterMask::GetLevelOffset(row, column); - const int index_next = GetIndex(row_next); - const int shift_next_row = GetShift(row_next, column); - const int level_offset_next_row = - LoopFilterMask::GetLevelOffset(row_next, column); - // TODO(chengchen): replace 0, 1, 2 to meaningful enum names. - // mask of current row. mask4x4 represents the horizontal filter length for - // the current vertical edge is 4, and we needs to apply 3-tap filtering. - // Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16. - uint64_t mask4x4_0 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4, - index) >> - shift) & - single_row_mask; - uint64_t mask8x8_0 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8, - index) >> - shift) & - single_row_mask; - uint64_t mask16x16_0 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16, - index) >> - shift) & - single_row_mask; - // mask4x4, mask8x8, mask16x16 are mutually exclusive. - assert((mask4x4_0 & mask8x8_0) == 0 && (mask4x4_0 & mask16x16_0) == 0 && - (mask8x8_0 & mask16x16_0) == 0); - // mask of the next row. With mask of current and the next row, we can call - // the corresponding SIMD function to apply filtering for two vertical - // edges together. - uint64_t mask4x4_1 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4, - index_next) >> - shift_next_row) & - single_row_mask; - uint64_t mask8x8_1 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8, - index_next) >> - shift_next_row) & - single_row_mask; - uint64_t mask16x16_1 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16, - index_next) >> - shift_next_row) & - single_row_mask; - // mask4x4, mask8x8, mask16x16 are mutually exclusive. - assert((mask4x4_1 & mask8x8_1) == 0 && (mask4x4_1 & mask16x16_1) == 0 && - (mask8x8_1 & mask16x16_1) == 0); - // Apply deblock filter for two rows. - uint8_t* src_row = src; - int column_offset = 0; - for (uint64_t mask = mask4x4_0 | mask8x8_0 | mask16x16_0 | mask4x4_1 | - mask8x8_1 | mask16x16_1; - mask != 0;) { - if ((mask & 1) != 0) { - // Filter parameters of current row. - const uint8_t level = masks_->GetLevel(unit_id, plane, type, - level_offset + column_offset); - int outer_thresh_0; - int inner_thresh_0; - int hev_thresh_0; - GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0, - &hev_thresh_0); - // Filter parameters of next row. Clip the index to avoid over - // reading at the edge of the block. The values will be unused in that - // case. - const int level_next_index = level_offset_next_row + column_offset; - const uint8_t level_next = - masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff); - int outer_thresh_1; - int inner_thresh_1; - int hev_thresh_1; - GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1, - &hev_thresh_1); - uint8_t* const src_row_next = src_row + row_stride; - - if (((mask16x16_0 | mask16x16_1) & 1) != 0) { - const dsp::LoopFilterSize size = (plane == kPlaneY) - ? 
dsp::kLoopFilterSize14 - : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask16x16_0 & mask16x16_1 & 1) != 0) { - // Apply dual vertical edge filtering. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else if ((mask16x16_0 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } else { - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } - } - - if (((mask8x8_0 | mask8x8_1) & 1) != 0) { - const dsp::LoopFilterSize size = (plane == kPlaneY) - ? dsp::kLoopFilterSize8 - : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask8x8_0 & mask8x8_1 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else if ((mask8x8_0 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } else { - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } - } + const int prediction_masks = kBlockWidthPixels[bp->size] - 1; + const int pixel_position = MultiplyBy4(column4x4); + const bool is_border = (pixel_position & prediction_masks) == 0; + const bool skip = bp->skip && bp->is_inter; + const bool skip_prev = bp_prev->skip && bp_prev->is_inter; + if (skip && skip_prev && !is_border) return false; + const int step_prev = + kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]]; + *filter_length = std::min(*step, step_prev); + return true; +} - if (((mask4x4_0 | mask4x4_1) & 1) != 0) { - const dsp::LoopFilterSize size = dsp::kLoopFilterSize4; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask4x4_0 & mask4x4_1 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else if ((mask4x4_0 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } else { - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } - } - } +bool PostFilter::GetVerticalDeblockFilterEdgeInfoUV( + const Plane plane, int row4x4, int column4x4, const int8_t subsampling_x, + const int8_t subsampling_y, BlockParameters* const* bp_ptr, uint8_t* level, + int* step, int* filter_length) const { + row4x4 = GetDeblockPosition(row4x4, subsampling_y); + column4x4 = GetDeblockPosition(column4x4, subsampling_x); + const BlockParameters* bp = *bp_ptr; + *step = kTransformWidth[bp->uv_transform_size]; + if (column4x4 == subsampling_x) return false; - mask4x4_0 >>= column_step; - mask8x8_0 >>= column_step; - mask16x16_0 >>= column_step; - mask4x4_1 >>= column_step; - mask8x8_1 >>= column_step; - mask16x16_1 >>= column_step; - mask >>= column_step; - column_offset += column_step; - src_row += src_step; - } - src += two_row_stride; + const int filter_id = + kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical]; + const uint8_t level_this = bp->deblock_filter_level[filter_id]; + const BlockParameters* bp_prev = *(bp_ptr - (1 << subsampling_x)); + const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; + *level = level_this; + if (level_this == 0) { + if (level_prev == 0) 
return false; + *level = level_prev; } + + const BlockSize size = + kPlaneResidualSize[bp->size][subsampling_x][subsampling_y]; + const int prediction_masks = kBlockWidthPixels[size] - 1; + const int pixel_position = MultiplyBy4(column4x4 >> subsampling_x); + const bool is_border = (pixel_position & prediction_masks) == 0; + const bool skip = bp->skip && bp->is_inter; + const bool skip_prev = bp_prev->skip && bp_prev->is_inter; + if (skip && skip_prev && !is_border) return false; + const int step_prev = kTransformWidth[bp_prev->uv_transform_size]; + *filter_length = std::min(*step, step_prev); + return true; } -void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, - int unit_id) { - static_cast<void>(unit_id); +void PostFilter::HorizontalDeblockFilter(Plane plane, int row4x4_start, + int column4x4_start) { const int8_t subsampling_x = subsampling_x_[plane]; const int8_t subsampling_y = subsampling_y_[plane]; const int column_step = 1 << subsampling_x; @@ -486,27 +240,22 @@ void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, int filter_length; for (int column4x4 = 0; MultiplyBy4(column4x4_start + column4x4) < width_ && - column4x4 < kNum4x4InLoopFilterMaskUnit; + column4x4 < kNum4x4InLoopFilterUnit; column4x4 += column_step, src += src_step) { uint8_t* src_row = src; for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; + row4x4 < kNum4x4InLoopFilterUnit; row4x4 += row_step) { - const bool need_filter = - GetDeblockFilterEdgeInfo<kLoopFilterTypeHorizontal>( - plane, row4x4_start + row4x4, column4x4_start + column4x4, - subsampling_x, subsampling_y, &level, &row_step, &filter_length); + const bool need_filter = GetHorizontalDeblockFilterEdgeInfo( + plane, row4x4_start + row4x4, column4x4_start + column4x4, + subsampling_x, subsampling_y, &level, &row_step, &filter_length); if (need_filter) { - int outer_thresh; - int inner_thresh; - int hev_thresh; - GetDeblockFilterParams(level, &outer_thresh, &inner_thresh, - &hev_thresh); const dsp::LoopFilterSize size = - GetLoopFilterSize(plane, filter_length); + (plane == kPlaneY) ? GetLoopFilterSizeY(filter_length) + : GetLoopFilterSizeUV(filter_length); const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - filter_func(src_row, src_stride, outer_thresh, inner_thresh, - hev_thresh); + filter_func(src_row, src_stride, outer_thresh_[level], + inner_thresh_[level], HevThresh(level)); } // TODO(chengchen): use shifts instead of multiplication. 
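
The edge-info helpers above all apply the same filter-level fallback: use the current block's level, borrow the neighboring block's level when the current one is zero, and report no filtering when both are zero (the callers additionally check the skip flags and block borders). A condensed sketch of just that rule, not the library's API:

#include <cstdint>

// Returns false when neither side of the edge requests filtering; otherwise
// writes the level to use into |*level|.
bool SelectDeblockLevel(uint8_t level_this, uint8_t level_prev, uint8_t* level) {
  *level = level_this;
  if (level_this == 0) {
    if (level_prev == 0) return false;
    *level = level_prev;  // Borrow the neighbor's level.
  }
  return true;
}
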
src_row += row_step * src_stride; @@ -515,9 +264,8 @@ void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, } } -void PostFilter::VerticalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, int unit_id) { - static_cast<void>(unit_id); +void PostFilter::VerticalDeblockFilter(Plane plane, int row4x4_start, + int column4x4_start) { const int8_t subsampling_x = subsampling_x_[plane]; const int8_t subsampling_y = subsampling_y_[plane]; const int row_step = 1 << subsampling_y; @@ -529,29 +277,30 @@ void PostFilter::VerticalDeblockFilterNoMask(Plane plane, int row4x4_start, uint8_t level; int filter_length; + BlockParameters* const* bp_row_base = block_parameters_.Address( + GetDeblockPosition(row4x4_start, subsampling_y), + GetDeblockPosition(column4x4_start, subsampling_x)); + const auto edge_info = deblock_vertical_edge_info_[plane]; + const int bp_stride = block_parameters_.columns4x4() * row_step; for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; - row4x4 += row_step, src += row_stride) { + row4x4 < kNum4x4InLoopFilterUnit; + row4x4 += row_step, src += row_stride, bp_row_base += bp_stride) { uint8_t* src_row = src; + BlockParameters* const* bp = bp_row_base; for (int column4x4 = 0; MultiplyBy4(column4x4_start + column4x4) < width_ && - column4x4 < kNum4x4InLoopFilterMaskUnit; - column4x4 += column_step) { - const bool need_filter = - GetDeblockFilterEdgeInfo<kLoopFilterTypeVertical>( - plane, row4x4_start + row4x4, column4x4_start + column4x4, - subsampling_x, subsampling_y, &level, &column_step, - &filter_length); + column4x4 < kNum4x4InLoopFilterUnit; + column4x4 += column_step, bp += column_step) { + const bool need_filter = (this->*edge_info)( + plane, row4x4_start + row4x4, column4x4_start + column4x4, + subsampling_x, subsampling_y, bp, &level, &column_step, + &filter_length); if (need_filter) { - int outer_thresh; - int inner_thresh; - int hev_thresh; - GetDeblockFilterParams(level, &outer_thresh, &inner_thresh, - &hev_thresh); const dsp::LoopFilterSize size = - GetLoopFilterSize(plane, filter_length); + (plane == kPlaneY) ? GetLoopFilterSizeY(filter_length) + : GetLoopFilterSizeUV(filter_length); const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - filter_func(src_row, src_stride, outer_thresh, inner_thresh, - hev_thresh); + filter_func(src_row, src_stride, outer_thresh_[level], + inner_thresh_[level], HevThresh(level)); } src_row += column_step * pixel_size_; column_step = DivideBy4(column_step << subsampling_x); @@ -573,21 +322,19 @@ void PostFilter::ApplyDeblockFilterForOneSuperBlockRow(int row4x4_start, if (row4x4 >= frame_header_.rows4x4) break; int column4x4; for (column4x4 = 0; column4x4 < frame_header_.columns4x4; - column4x4 += kNum4x4InLoopFilterMaskUnit) { + column4x4 += kNum4x4InLoopFilterUnit) { // First apply vertical filtering - VerticalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4, - column4x4, 0); + VerticalDeblockFilter(static_cast<Plane>(plane), row4x4, column4x4); // Delay one superblock to apply horizontal filtering. if (column4x4 != 0) { - HorizontalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4, - column4x4 - kNum4x4InLoopFilterMaskUnit, - 0); + HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4, + column4x4 - kNum4x4InLoopFilterUnit); } } // Horizontal filtering for the last 64x64 block. 
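
The loop above runs the vertical pass on each 64x64 unit as soon as it is reached but delays the horizontal pass by one unit, so the horizontal filter always reads pixels whose vertical filtering is complete; the call that follows flushes the final unit. A standalone sketch of that ordering only (print statements stand in for the real filters, and it assumes at least one unit per row):

#include <cstdio>

constexpr int kUnit4x4 = 16;  // 64-pixel loop filter unit = 16 4x4 blocks.

void VerticalPass(int row4x4, int column4x4) {
  std::printf("vertical   %d,%d\n", row4x4, column4x4);
}
void HorizontalPass(int row4x4, int column4x4) {
  std::printf("horizontal %d,%d\n", row4x4, column4x4);
}

// Mirrors the delay-by-one-unit ordering used above.
void FilterOneSuperBlockRow(int row4x4, int columns4x4) {
  int column4x4 = 0;
  for (; column4x4 < columns4x4; column4x4 += kUnit4x4) {
    VerticalPass(row4x4, column4x4);
    if (column4x4 != 0) HorizontalPass(row4x4, column4x4 - kUnit4x4);
  }
  HorizontalPass(row4x4, column4x4 - kUnit4x4);  // Flush the last unit.
}
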
- HorizontalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4, - column4x4 - kNum4x4InLoopFilterMaskUnit, 0); + HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4, + column4x4 - kNum4x4InLoopFilterUnit); } } } @@ -602,12 +349,11 @@ void PostFilter::DeblockFilterWorker(int jobs_per_plane, const Plane* planes, total_jobs) { const Plane plane = planes[job_index / jobs_per_plane]; const int row_unit = job_index % jobs_per_plane; - const int row4x4 = row_unit * kNum4x4InLoopFilterMaskUnit; + const int row4x4 = row_unit * kNum4x4InLoopFilterUnit; for (int column4x4 = 0, column_unit = 0; column4x4 < frame_header_.columns4x4; - column4x4 += kNum4x4InLoopFilterMaskUnit, ++column_unit) { - const int unit_id = GetDeblockUnitId(row_unit, column_unit); - (this->*deblock_filter)(plane, row4x4, column4x4, unit_id); + column4x4 += kNum4x4InLoopFilterUnit, ++column_unit) { + (this->*deblock_filter)(plane, row4x4, column4x4); } } } @@ -635,8 +381,7 @@ void PostFilter::ApplyDeblockFilterThreaded() { // The only synchronization involved is to know when the each directional // filter is complete for the entire frame. for (auto& type : {kLoopFilterTypeVertical, kLoopFilterTypeHorizontal}) { - const DeblockFilter deblock_filter = - deblock_filter_type_table_[kDeblockFilterBitMask][type]; + const DeblockFilter deblock_filter = deblock_filter_func_[type]; std::atomic<int> job_counter(0); BlockingCounter pending_workers(num_workers); for (int i = 0; i < num_workers; ++i) { @@ -656,4 +401,31 @@ void PostFilter::ApplyDeblockFilterThreaded() { } } +void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type, + int row4x4_start, int column4x4_start, + int column4x4_end, int sb4x4) { + assert(row4x4_start >= 0); + assert(DoDeblock()); + + column4x4_end = std::min(column4x4_end, frame_header_.columns4x4); + if (column4x4_start >= column4x4_end) return; + + const DeblockFilter deblock_filter = deblock_filter_func_[loop_filter_type]; + const int sb_height4x4 = + std::min(sb4x4, frame_header_.rows4x4 - row4x4_start); + for (int plane = kPlaneY; plane < planes_; ++plane) { + if (plane != kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) { + continue; + } + + for (int y = 0; y < sb_height4x4; y += kNum4x4InLoopFilterUnit) { + const int row4x4 = row4x4_start + y; + for (int column4x4 = column4x4_start; column4x4 < column4x4_end; + column4x4 += kNum4x4InLoopFilterUnit) { + (this->*deblock_filter)(static_cast<Plane>(plane), row4x4, column4x4); + } + } + } +} + } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc b/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc new file mode 100644 index 00000000000..ca12aaaeb7e --- /dev/null +++ b/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc @@ -0,0 +1,85 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Thresholds for the deblocking filter. 
Precomputed values of part of Section +// 7.14.4 for all possible values of sharpness. + +constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = { + {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; + +constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = { + {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, + 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, + 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157, + 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, + 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, + 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, + 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, + 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, + 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, + 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33, + 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, + 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, + 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, + 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31, + 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, + 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, + 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, + 112, 114, 116, 118, 
120, 122, 124, 126, 128, 130, 132, 134}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, + 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, + 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, + 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, + 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}}; diff --git a/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc b/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc index a36788057ba..b36ad80cf05 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc @@ -21,16 +21,14 @@ void PostFilter::ApplyLoopRestorationForOneUnit( uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride, const Plane plane, const int plane_height, const int x, const int y, const int row, const int column, const int unit_row, - const int current_process_unit_height, const int plane_process_unit_width, - const int plane_unit_size, const int num_horizontal_units, - const int plane_width, Array2DView<Pixel>* const loop_restored_window) { + const int current_process_unit_height, const int plane_unit_size, + const int num_horizontal_units, const int plane_width, + Array2DView<Pixel>* const loop_restored_window) { const int unit_x = x + column; const int unit_y = y + row; const int current_process_unit_width = - (unit_x + plane_process_unit_width <= plane_width) - ? plane_process_unit_width - : plane_width - unit_x; - uint8_t* cdef_unit_buffer = + std::min(plane_unit_size, plane_width - unit_x); + const uint8_t* cdef_unit_buffer = cdef_buffer + unit_y * cdef_buffer_stride + unit_x * pixel_size_; const int unit_column = std::min(unit_x / plane_unit_size, num_horizontal_units - 1); @@ -49,54 +47,47 @@ void PostFilter::ApplyLoopRestorationForOneUnit( return; } + const ptrdiff_t block_buffer_stride = + kRestorationUnitWidthWithBorders * sizeof(Pixel); // The SIMD implementation of wiener filter (currently WienerFilter_SSE4_1()) // over-reads 6 bytes, so add 6 extra bytes at the end of block_buffer for 8 // bit. - alignas(alignof(uint16_t)) - uint8_t block_buffer[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * - sizeof(Pixel) + - ((sizeof(Pixel) == 1) ? 6 : 0)]; - const ptrdiff_t block_buffer_stride = - kRestorationProcessingUnitSizeWithBorders * pixel_size_; - IntermediateBuffers intermediate_buffers; - - RestorationBuffer restoration_buffer = { - {intermediate_buffers.box_filter.output[0], - intermediate_buffers.box_filter.output[1]}, - plane_process_unit_width, - {intermediate_buffers.box_filter.intermediate_a, - intermediate_buffers.box_filter.intermediate_b}, - kRestorationProcessingUnitSizeWithBorders + kRestorationPadding, - intermediate_buffers.wiener, - kMaxSuperBlockSizeInPixels}; - const int deblock_buffer_units = 64 >> subsampling_y_[plane]; - uint8_t* const deblock_buffer = deblock_buffer_.data(plane); - const int deblock_buffer_stride = deblock_buffer_.stride(plane); - const int deblock_unit_y = - std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0); - uint8_t* deblock_unit_buffer = - (deblock_buffer != nullptr) - ? 
deblock_buffer + deblock_unit_y * deblock_buffer_stride + - unit_x * pixel_size_ - : nullptr; + alignas(alignof(uint16_t)) uint8_t + block_buffer[kRestorationUnitHeightWithBorders * block_buffer_stride + + ((sizeof(Pixel) == 1) ? 6 : 0)]; + RestorationBuffer restoration_buffer; + const uint8_t* source; + ptrdiff_t source_stride; + if (DoCdef()) { + const int deblock_buffer_units = 64 >> subsampling_y_[plane]; + const uint8_t* const deblock_buffer = deblock_buffer_.data(plane); + assert(deblock_buffer != nullptr); + const int deblock_buffer_stride = deblock_buffer_.stride(plane); + const int deblock_unit_y = + std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0); + const uint8_t* const deblock_unit_buffer = + deblock_buffer + deblock_unit_y * deblock_buffer_stride + + unit_x * pixel_size_; + PrepareLoopRestorationBlock<Pixel>( + cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer, + deblock_buffer_stride, block_buffer, block_buffer_stride, + current_process_unit_width, current_process_unit_height, unit_y == 0, + unit_y + current_process_unit_height >= plane_height); + source = block_buffer + kRestorationBorder * block_buffer_stride + + kRestorationBorder * pixel_size_; + source_stride = kRestorationUnitWidthWithBorders; + } else { + source = cdef_unit_buffer; + source_stride = cdef_buffer_stride / sizeof(Pixel); + } assert(type == kLoopRestorationTypeSgrProj || type == kLoopRestorationTypeWiener); const dsp::LoopRestorationFunc restoration_func = dsp_.loop_restorations[type - 2]; - PrepareLoopRestorationBlock<Pixel>( - DoCdef(), cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer, - deblock_buffer_stride, block_buffer, block_buffer_stride, - current_process_unit_width, current_process_unit_height, unit_y == 0, - unit_y + current_process_unit_height >= plane_height); - restoration_func(reinterpret_cast<const uint8_t*>( - block_buffer + kRestorationBorder * block_buffer_stride + - kRestorationBorder * pixel_size_), - &(*loop_restored_window)[row][column], + restoration_func(source, &(*loop_restored_window)[row][column], restoration_info_->loop_restoration_info( static_cast<Plane>(plane), unit_id), - block_buffer_stride, - loop_restored_window->columns() * pixel_size_, + source_stride, loop_restored_window->columns(), current_process_unit_width, current_process_unit_height, &restoration_buffer); } @@ -104,9 +95,8 @@ void PostFilter::ApplyLoopRestorationForOneUnit( template <typename Pixel> void PostFilter::ApplyLoopRestorationForSuperBlock( const Plane plane, const int x, const int y, const int unit_row, - const int current_process_unit_height, const int process_unit_width) { + const int current_process_unit_height, const int plane_unit_size) { const int stride = frame_buffer_.stride(plane); - const int plane_unit_size = loop_restoration_.unit_size[plane]; const int num_horizontal_units = restoration_info_->num_horizontal_units(static_cast<Plane>(plane)); const int plane_width = @@ -119,23 +109,14 @@ void PostFilter::ApplyLoopRestorationForSuperBlock( x * pixel_size_)); ApplyLoopRestorationForOneUnit<Pixel>( superres_buffer_[plane], stride, plane, plane_height, x, y, 0, 0, - unit_row, current_process_unit_height, process_unit_width, - plane_unit_size, num_horizontal_units, plane_width, - &loop_restored_window); + unit_row, current_process_unit_height, plane_unit_size, + num_horizontal_units, plane_width, &loop_restored_window); } void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4) { assert(row4x4_start >= 0); assert(DoRestoration()); - 
const int plane_process_unit_width[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneV]}; - const int plane_process_unit_height[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneV]}; for (int plane = 0; plane < planes_; ++plane) { if (frame_header_.loop_restoration.type[plane] == kLoopRestorationTypeNone) { @@ -149,36 +130,36 @@ void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, subsampling_x_[plane]); const int num_vertical_units = restoration_info_->num_vertical_units(static_cast<Plane>(plane)); - const int process_unit_width = plane_process_unit_width[plane]; + const int plane_unit_size = frame_header_.loop_restoration.unit_size[plane]; + const int plane_process_unit_height = + kRestorationUnitHeight >> subsampling_y_[plane]; + int y = (row4x4_start == 0) + ? 0 + : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) - + unit_height_offset; + int expected_height = plane_process_unit_height - + ((row4x4_start == 0) ? unit_height_offset : 0); for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) { - const int row4x4 = row4x4_start + sb_y; - const int y = (MultiplyBy4(row4x4) - (row4x4 == 0 ? 0 : 8)) >> - subsampling_y_[plane]; if (y >= plane_height) break; - const int plane_unit_size = - frame_header_.loop_restoration.unit_size[plane]; const int unit_row = std::min((y + unit_height_offset) / plane_unit_size, num_vertical_units - 1); - const int expected_height = plane_process_unit_height[plane] + - ((y == 0) ? -unit_height_offset : 0); const int current_process_unit_height = - (y + expected_height <= plane_height) ? 
expected_height - : plane_height - y; - for (int column4x4 = 0;; column4x4 += 16) { - const int x = MultiplyBy4(column4x4) >> subsampling_x_[plane]; - if (x >= plane_width) break; + std::min(expected_height, plane_height - y); + for (int x = 0; x < plane_width; x += plane_unit_size) { #if LIBGAV1_MAX_BITDEPTH >= 10 if (bitdepth_ >= 10) { ApplyLoopRestorationForSuperBlock<uint16_t>( static_cast<Plane>(plane), x, y, unit_row, - current_process_unit_height, process_unit_width); + current_process_unit_height, plane_unit_size); continue; } #endif ApplyLoopRestorationForSuperBlock<uint8_t>( static_cast<Plane>(plane), x, y, unit_row, - current_process_unit_height, process_unit_width); + current_process_unit_height, plane_unit_size); } + expected_height = plane_process_unit_height; + y += current_process_unit_height; } } } @@ -188,18 +169,16 @@ void PostFilter::ApplyLoopRestorationForOneRowInWindow( uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride, const Plane plane, const int plane_height, const int plane_width, const int x, const int y, const int row, const int unit_row, - const int current_process_unit_height, const int process_unit_width, - const int window_width, const int plane_unit_size, - const int num_horizontal_units) { + const int current_process_unit_height, const int plane_unit_size, + const int window_width, const int num_horizontal_units) { Array2DView<Pixel> loop_restored_window( window_buffer_height_, window_buffer_width_, reinterpret_cast<Pixel*>(threaded_window_buffer_)); - for (int column = 0; column < window_width; column += process_unit_width) { + for (int column = 0; column < window_width; column += plane_unit_size) { ApplyLoopRestorationForOneUnit<Pixel>( cdef_buffer, cdef_buffer_stride, plane, plane_height, x, y, row, column, - unit_row, current_process_unit_height, process_unit_width, - plane_unit_size, num_horizontal_units, plane_width, - &loop_restored_window); + unit_row, current_process_unit_height, plane_unit_size, + num_horizontal_units, plane_width, &loop_restored_window); } } @@ -210,20 +189,14 @@ void PostFilter::ApplyLoopRestorationForOneRowInWindow( // completes filtering until all jobs are finished. This approach requires an // extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose // size is the size of the window. It also needs block buffers (i.e., -// |block_buffer| and |intermediate_buffers| in -// ApplyLoopRestorationForOneUnit()) to store intermediate results in loop -// restoration for each thread. After all units inside the window are filtered, -// the output is written to the frame buffer. +// |block_buffer| in ApplyLoopRestorationForOneUnit()) to store intermediate +// results in loop restoration for each thread. After all units inside the +// window are filtered, the output is written to the frame buffer. 
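
The comment above summarizes the window-based threading: most unit rows in the current window are handed to the worker pool, the calling thread filters the remaining rows itself, and a blocking counter delays the copy from |threaded_window_buffer_| to the frame until every job has finished. A simplified sketch of that scheduling shape, using generic stand-ins rather than the libgav1 ThreadPool and BlockingCounter types:

#include <condition_variable>
#include <functional>
#include <mutex>

// Minimal counting latch, standing in for BlockingCounter.
class SimpleLatch {
 public:
  explicit SimpleLatch(int count) : count_(count) {}
  void Decrement() {
    std::lock_guard<std::mutex> lock(mutex_);
    if (--count_ == 0) done_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    done_.wait(lock, [this] { return count_ == 0; });
  }

 private:
  std::mutex mutex_;
  std::condition_variable done_;
  int count_;
};

void RestoreOneRow(int row) { /* filter one row of units into the window. */ }

// |schedule| posts a job to a worker pool (stand-in for ThreadPool::Schedule).
void RestoreWindow(int num_rows, int num_workers,
                   const std::function<void(std::function<void()>)>& schedule) {
  // As in the code below: keep a share of the rows on the calling thread.
  const int pooled_rows = num_rows * num_workers / (num_workers + 1);
  SimpleLatch pending(pooled_rows);
  for (int row = 0; row < num_rows; ++row) {
    if (row < pooled_rows) {
      schedule([row, &pending] { RestoreOneRow(row); pending.Decrement(); });
    } else {
      RestoreOneRow(row);
    }
  }
  pending.Wait();  // Window buffer is complete; safe to copy to the frame.
}
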
template <typename Pixel> void PostFilter::ApplyLoopRestorationThreaded() { - const int plane_process_unit_width[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneV]}; const int plane_process_unit_height[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneV]}; + kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU], + kRestorationUnitHeight >> subsampling_y_[kPlaneV]}; for (int plane = kPlaneY; plane < planes_; ++plane) { if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { @@ -270,11 +243,11 @@ void PostFilter::ApplyLoopRestorationThreaded() { plane_process_unit_height[plane] + 1; } + const int jobs_for_threadpool = + vertical_units_per_window * num_workers / (num_workers + 1); for (int x = 0; x < plane_width; x += window_buffer_width_) { const int actual_window_width = std::min(window_buffer_width_, plane_width - x); - const int jobs_for_threadpool = - vertical_units_per_window * num_workers / (num_workers + 1); assert(jobs_for_threadpool < vertical_units_per_window); BlockingCounter pending_jobs(jobs_for_threadpool); int job_count = 0; @@ -282,37 +255,32 @@ void PostFilter::ApplyLoopRestorationThreaded() { for (int row = 0; row < actual_window_height; row += current_process_unit_height) { const int unit_y = y + row; - const int expected_height = plane_process_unit_height[plane] + - ((unit_y == 0) ? -unit_height_offset : 0); + const int expected_height = plane_process_unit_height[plane] - + ((unit_y == 0) ? unit_height_offset : 0); current_process_unit_height = - (unit_y + expected_height <= plane_height) - ? 
expected_height - : plane_height - unit_y; + std::min(expected_height, plane_height - unit_y); const int unit_row = std::min((unit_y + unit_height_offset) / plane_unit_size, num_vertical_units - 1); - const int process_unit_width = plane_process_unit_width[plane]; if (job_count < jobs_for_threadpool) { thread_pool_->Schedule( - [this, src_buffer, src_stride, process_unit_width, + [this, src_buffer, src_stride, plane_unit_size, current_process_unit_height, actual_window_width, - plane_unit_size, num_horizontal_units, x, y, row, unit_row, - plane_height, plane_width, plane, &pending_jobs]() { + num_horizontal_units, x, y, row, unit_row, plane_height, + plane_width, plane, &pending_jobs]() { ApplyLoopRestorationForOneRowInWindow<Pixel>( src_buffer, src_stride, static_cast<Plane>(plane), plane_height, plane_width, x, y, row, unit_row, - current_process_unit_height, process_unit_width, - actual_window_width, plane_unit_size, - num_horizontal_units); + current_process_unit_height, plane_unit_size, + actual_window_width, num_horizontal_units); pending_jobs.Decrement(); }); } else { ApplyLoopRestorationForOneRowInWindow<Pixel>( src_buffer, src_stride, static_cast<Plane>(plane), plane_height, plane_width, x, y, row, unit_row, current_process_unit_height, - process_unit_width, actual_window_width, plane_unit_size, - num_horizontal_units); + plane_unit_size, actual_window_width, num_horizontal_units); } ++job_count; } diff --git a/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc b/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc index 1b65e9fbcf8..6174aabdee6 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc @@ -31,6 +31,9 @@ namespace libgav1 { namespace { +// Import all the constants in the anonymous namespace. +#include "src/post_filter/deblock_thresholds.inc" + // Row indices of deblocked pixels needed by loop restoration. This is used to // populate the |deblock_buffer_| when cdef is on. The first dimension is // subsampling_y. @@ -122,16 +125,11 @@ void ExtendFrame(uint8_t* const frame_start, const int width, const int height, } // namespace -PostFilter::PostFilter( - const ObuFrameHeader& frame_header, - const ObuSequenceHeader& sequence_header, LoopFilterMask* const masks, - const Array2D<int16_t>& cdef_index, - const Array2D<TransformSize>& inter_transform_sizes, - LoopRestorationInfo* const restoration_info, - BlockParametersHolder* block_parameters, YuvBuffer* const frame_buffer, - YuvBuffer* const deblock_buffer, const dsp::Dsp* dsp, - ThreadPool* const thread_pool, uint8_t* const threaded_window_buffer, - uint8_t* const superres_line_buffer, int do_post_filter_mask) +PostFilter::PostFilter(const ObuFrameHeader& frame_header, + const ObuSequenceHeader& sequence_header, + FrameScratchBuffer* const frame_scratch_buffer, + YuvBuffer* const frame_buffer, const dsp::Dsp* dsp, + int do_post_filter_mask) : frame_header_(frame_header), loop_restoration_(frame_header.loop_restoration), dsp_(*dsp), @@ -149,24 +147,24 @@ PostFilter::PostFilter( : kMaxPlanes), pixel_size_(static_cast<int>((bitdepth_ == 8) ? 
sizeof(uint8_t) : sizeof(uint16_t))), - masks_(masks), - cdef_index_(cdef_index), - inter_transform_sizes_(inter_transform_sizes), - threaded_window_buffer_(threaded_window_buffer), - restoration_info_(restoration_info), - window_buffer_width_(GetWindowBufferWidth(thread_pool, frame_header)), - window_buffer_height_(GetWindowBufferHeight(thread_pool, frame_header)), - superres_line_buffer_(superres_line_buffer), - block_parameters_(*block_parameters), + inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]), + outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]), + cdef_index_(frame_scratch_buffer->cdef_index), + inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes), + threaded_window_buffer_( + frame_scratch_buffer->threaded_window_buffer.get()), + restoration_info_(&frame_scratch_buffer->loop_restoration_info), + superres_line_buffer_(frame_scratch_buffer->superres_line_buffer.get()), + block_parameters_(frame_scratch_buffer->block_parameters_holder), frame_buffer_(*frame_buffer), - deblock_buffer_(*deblock_buffer), + deblock_buffer_(frame_scratch_buffer->deblock_buffer), do_post_filter_mask_(do_post_filter_mask), - thread_pool_(thread_pool) { + thread_pool_( + frame_scratch_buffer->threading_strategy.post_filter_thread_pool()), + window_buffer_width_(GetWindowBufferWidth(thread_pool_, frame_header)), + window_buffer_height_(GetWindowBufferHeight(thread_pool_, frame_header)) { const int8_t zero_delta_lf[kFrameLfCount] = {}; ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_); - if (DoDeblock()) { - InitDeblockFilterParams(); - } if (DoSuperRes()) { for (int plane = 0; plane < planes_; ++plane) { const int downscaled_width = @@ -196,7 +194,7 @@ PostFilter::PostFilter( // In single threaded mode, we apply SuperRes without making a copy of the // input row by writing the output to one row to the top (we refer to this // process as "in place superres" in our code). 
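
The comment above describes "in place superres": in single-threaded mode each upscaled row is written one row above its source row, so the wider output never overwrites source pixels that are still needed. A schematic sketch of that idea, with a hypothetical nearest-neighbor upscaler standing in for the real SuperRes kernel; it assumes the caller provides one writable border row above |plane| and that |stride| is at least |out_width|:

#include <cstddef>
#include <cstdint>

// Hypothetical horizontal upscaler (nearest neighbor), for illustration only.
void UpscaleRow(const uint8_t* src, int in_width, uint8_t* dst, int out_width) {
  for (int x = 0; x < out_width; ++x) {
    dst[x] = src[x * in_width / out_width];
  }
}

// Writes the output of row y into row y - 1. Row y's source pixels are only
// overwritten by the y + 1 iteration, after they have already been read, so
// no per-row copy into a line buffer is needed.
void InPlaceSuperRes(uint8_t* plane, ptrdiff_t stride, int in_width,
                     int out_width, int height) {
  uint8_t* row = plane;
  for (int y = 0; y < height; ++y, row += stride) {
    UpscaleRow(row, in_width, row - stride, out_width);
  }
}
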
- const bool in_place_superres = DoSuperRes() && thread_pool == nullptr; + const bool in_place_superres = DoSuperRes() && thread_pool_ == nullptr; if (DoCdef() || DoRestoration() || in_place_superres) { for (int plane = 0; plane < planes_; ++plane) { int horizontal_shift = 0; @@ -372,8 +370,8 @@ void PostFilter::ApplyFilteringThreaded() { if (DoDeblock()) ApplyDeblockFilterThreaded(); if (DoCdef() && DoRestoration()) { for (int row4x4 = 0; row4x4 < frame_header_.rows4x4; - row4x4 += kNum4x4InLoopFilterMaskUnit) { - SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterMaskUnit); + row4x4 += kNum4x4InLoopFilterUnit) { + SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterUnit); } } if (DoCdef()) ApplyCdef(); @@ -383,9 +381,10 @@ void PostFilter::ApplyFilteringThreaded() { } int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, - bool is_last_row) { + bool is_last_row, + bool do_deblock) { if (row4x4 < 0) return -1; - if (DoDeblock()) { + if (DoDeblock() && do_deblock) { ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4); } if (DoRestoration() && DoCdef()) { diff --git a/chromium/third_party/libgav1/src/src/post_filter/super_res.cc b/chromium/third_party/libgav1/src/src/post_filter/super_res.cc index 2dc1dcd61cf..8f17a37b5cb 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/super_res.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/super_res.cc @@ -35,10 +35,10 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& buffers, const std::array<int, kMaxPlanes>& strides, const std::array<int, kMaxPlanes>& rows, size_t line_buffer_offset) { - uint8_t* const line_buffer_start = - in_place ? nullptr - : superres_line_buffer_ + line_buffer_offset + - kSuperResHorizontalBorder * pixel_size_; + // Only used when |in_place| == false. + uint8_t* const line_buffer_start = superres_line_buffer_ + + line_buffer_offset + + kSuperResHorizontalBorder * pixel_size_; for (int plane = kPlaneY; plane < planes_; ++plane) { const int8_t subsampling_x = subsampling_x_[plane]; const int plane_width = diff --git a/chromium/third_party/libgav1/src/src/threading_strategy.cc b/chromium/third_party/libgav1/src/src/threading_strategy.cc index 75e2ed60270..5c0b940c835 100644 --- a/chromium/third_party/libgav1/src/src/threading_strategy.cc +++ b/chromium/third_party/libgav1/src/src/threading_strategy.cc @@ -16,15 +16,52 @@ #include <algorithm> #include <cassert> +#include <memory> +#include "src/frame_scratch_buffer.h" #include "src/utils/constants.h" #include "src/utils/logging.h" +#include "src/utils/vector.h" namespace libgav1 { +namespace { + +// Computes the number of frame threads to be used based on the following +// heuristic: +// * If |thread_count| == 1, return 0. +// * If |thread_count| <= |tile_count| * 4, return 0. +// * Otherwise, return the largest value of i which satisfies the following +// condition: i + i * tile_columns <= thread_count. This ensures that there +// are at least |tile_columns| worker threads for each frame thread. +// * This function will never return 1 or a value > |thread_count|. +// +// This heuristic is based empirical performance data. The in-frame threading +// model (combination of tile multithreading, superblock row multithreading and +// post filter multithreading) performs better than the frame parallel model +// until we reach the threshold of |thread_count| > |tile_count| * 4. +// +// It is a function of |tile_count| since tile threading and superblock row +// multithreading will scale only as a factor of |tile_count|. 
The threshold 4 +// is arrived at based on empirical data. The general idea is that superblock +// row multithreading plateaus at 4 * |tile_count| because in most practical +// cases there aren't more than that many superblock rows and columns available +// to work on in parallel. +int ComputeFrameThreadCount(int thread_count, int tile_count, + int tile_columns) { + assert(thread_count > 0); + if (thread_count == 1) return 0; + return (thread_count <= tile_count * 4) + ? 0 + : std::max(2, thread_count / (1 + tile_columns)); +} + +} // namespace bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, int thread_count) { assert(thread_count > 0); + frame_parallel_ = false; + if (thread_count == 1) { thread_pool_.reset(nullptr); tile_thread_count_ = 0; @@ -103,14 +140,74 @@ bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, return true; } +bool ThreadingStrategy::Reset(int thread_count) { + assert(thread_count > 0); + frame_parallel_ = true; + + // In frame parallel mode, we simply access the underlying |thread_pool_| + // directly. So ensure all the other threadpool getter functions return + // nullptr. Also, superblock row multithreading is always disabled in frame + // parallel mode. + tile_thread_count_ = 0; + max_tile_index_for_row_threads_ = 0; + + if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) { + thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count); + if (thread_pool_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.", + thread_count); + return false; + } + } + return true; +} + bool InitializeThreadPoolsForFrameParallel( - int thread_count, std::unique_ptr<ThreadPool>* const frame_thread_pool) { - *frame_thread_pool = ThreadPool::Create(thread_count); + int thread_count, int tile_count, int tile_columns, + std::unique_ptr<ThreadPool>* const frame_thread_pool, + FrameScratchBufferPool* const frame_scratch_buffer_pool) { + assert(*frame_thread_pool == nullptr); + thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)); + const int frame_threads = + ComputeFrameThreadCount(thread_count, tile_count, tile_columns); + if (frame_threads == 0) return true; + *frame_thread_pool = ThreadPool::Create(frame_threads); if (*frame_thread_pool == nullptr) { LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.", - thread_count); + frame_threads); return false; } + int remaining_threads = thread_count - frame_threads; + if (remaining_threads == 0) return true; + int threads_per_frame = remaining_threads / frame_threads; + const int extra_threads = remaining_threads % frame_threads; + Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers; + if (!frame_scratch_buffers.reserve(frame_threads)) return false; + // Create the tile thread pools. + for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) { + std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = + frame_scratch_buffer_pool->Get(); + if (frame_scratch_buffer == nullptr) { + return false; + } + // If the number of tile threads cannot be divided equally amongst all the + // frame threads, assign one extra thread to the first |extra_threads| frame + // threads. 
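
As a concrete walk-through of the heuristic and thread split above (hypothetical numbers): with thread_count = 16, tile_count = 2 and tile_columns = 2, the frame thread count is max(2, 16 / 3) = 5, leaving 11 worker threads, so threads_per_frame = 2 and extra_threads = 1, giving per-frame worker pools of sizes {3, 2, 2, 2, 2}. A standalone copy of the arithmetic for experimentation (not the library's API):

#include <algorithm>
#include <cstdio>

// Standalone copy of the frame-thread heuristic above.
int FrameThreads(int thread_count, int tile_count, int tile_columns) {
  if (thread_count == 1) return 0;
  return (thread_count <= tile_count * 4)
             ? 0
             : std::max(2, thread_count / (1 + tile_columns));
}

int main() {
  const int thread_count = 16, tile_count = 2, tile_columns = 2;
  const int frame_threads = FrameThreads(thread_count, tile_count, tile_columns);
  const int remaining = thread_count - frame_threads;       // 11
  const int threads_per_frame = remaining / frame_threads;  // 2
  const int extra_threads = remaining % frame_threads;      // 1
  std::printf("frame_threads=%d threads_per_frame=%d extra_threads=%d\n",
              frame_threads, threads_per_frame, extra_threads);
  return 0;
}
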
+ const int current_frame_thread_count = + threads_per_frame + static_cast<int>(i < extra_threads); + if (!frame_scratch_buffer->threading_strategy.Reset( + current_frame_thread_count)) { + return false; + } + remaining_threads -= current_frame_thread_count; + frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer)); + } + // We release the frame scratch buffers in reverse order so that the extra + // threads are allocated to buffers in the top of the stack. + for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0; + --i) { + frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i])); + } return true; } diff --git a/chromium/third_party/libgav1/src/src/threading_strategy.h b/chromium/third_party/libgav1/src/src/threading_strategy.h index 5822bb31f36..84b35896d26 100644 --- a/chromium/third_party/libgav1/src/src/threading_strategy.h +++ b/chromium/third_party/libgav1/src/src/threading_strategy.h @@ -25,6 +25,8 @@ namespace libgav1 { +class FrameScratchBufferPool; + // This class allocates and manages the worker threads among thread pools used // for multi-threaded decoding. class ThreadingStrategy { @@ -36,18 +38,28 @@ class ThreadingStrategy { ThreadingStrategy& operator=(const ThreadingStrategy&) = delete; // Creates or re-allocates the thread pools based on the |frame_header| and - // |thread_count|. This function is idempotent if the |frame_header| and - // |thread_count| doesn't change between calls (it will only create new - // threads on the first call and do nothing on the subsequent calls). This - // function also starts the worker threads whenever it creates new thread - // pools. + // |thread_count|. This function is used only in non frame-parallel mode. This + // function is idempotent if the |frame_header| and |thread_count| don't + // change between calls (it will only create new threads on the first call and + // do nothing on the subsequent calls). This function also starts the worker + // threads whenever it creates new thread pools. // The following strategy is used to allocate threads: // * One thread is allocated for decoding each Tile. // * Any remaining threads are allocated for superblock row multi-threading // within each of the tile in a round robin fashion. + // Note: During the lifetime of a ThreadingStrategy object, only one of the + // Reset() variants will be used. LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header, int thread_count); + // Creates or re-allocates a thread pool with |thread_count| threads. This + // function is used only in frame parallel mode. This function is idempotent + // if the |thread_count| doesn't change between calls (it will only create new + // threads on the first call and do nothing on the subsequent calls). + // Note: During the lifetime of a ThreadingStrategy object, only one of the + // Reset() variants will be used. + LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count); + // Returns a pointer to the ThreadPool that is to be used for Tile // multi-threading. ThreadPool* tile_thread_pool() const { @@ -56,8 +68,14 @@ class ThreadingStrategy { int tile_thread_count() const { return tile_thread_count_; } + // Returns a pointer to the underlying ThreadPool. + // Note: Valid only when |frame_parallel_| is true. This is used for + // facilitating in-frame multi-threading in that case. 
+ ThreadPool* thread_pool() const { return thread_pool_.get(); } + // Returns a pointer to the ThreadPool that is to be used within the Tile at // index |tile_index| for superblock row multi-threading. + // Note: Valid only when |frame_parallel_| is false. ThreadPool* row_thread_pool(int tile_index) const { return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get() : nullptr; @@ -65,20 +83,48 @@ class ThreadingStrategy { // Returns a pointer to the ThreadPool that is to be used for post filter // multi-threading. - ThreadPool* post_filter_thread_pool() const { return thread_pool_.get(); } + // Note: Valid only when |frame_parallel_| is false. + ThreadPool* post_filter_thread_pool() const { + return frame_parallel_ ? nullptr : thread_pool_.get(); + } // Returns a pointer to the ThreadPool that is to be used for film grain // synthesis and blending. + // Note: Valid only when |frame_parallel_| is false. ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); } private: std::unique_ptr<ThreadPool> thread_pool_; - int tile_thread_count_; - int max_tile_index_for_row_threads_; + int tile_thread_count_ = 0; + int max_tile_index_for_row_threads_ = 0; + bool frame_parallel_ = false; }; +// Initializes the |frame_thread_pool| and the necessary worker threadpools (the +// threading_strategy objects in each of the frame scratch buffer in +// |frame_scratch_buffer_pool|) as follows: +// * frame_threads = ComputeFrameThreadCount(); +// * For more details on how frame_threads is computed, see the function +// comment in ComputeFrameThreadCount(). +// * |frame_thread_pool| is created with |frame_threads| threads. +// * divide the remaining number of threads into each frame thread and +// initialize a frame_scratch_buffer.threading_strategy for each frame +// thread. +// When this function is called, |frame_scratch_buffer_pool| must be empty. If +// this function returns true, it means the initialization was successful and +// one of the following is true: +// * |frame_thread_pool| has been successfully initialized and +// |frame_scratch_buffer_pool| has been successfully populated with +// |frame_threads| buffers to be used by each frame thread. The total +// number of threads that this function creates will always be equal to +// |thread_count|. +// * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not +// modified. This means that frame threading will not be used and the +// decoder will continue to operate normally in non frame parallel mode. 
LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel( - int thread_count, std::unique_ptr<ThreadPool>* frame_thread_pool); + int thread_count, int tile_count, int tile_columns, + std::unique_ptr<ThreadPool>* frame_thread_pool, + FrameScratchBufferPool* frame_scratch_buffer_pool); } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/tile.h b/chromium/third_party/libgav1/src/src/tile.h index d8f48b4df27..7fb7e2296c0 100644 --- a/chromium/third_party/libgav1/src/src/tile.h +++ b/chromium/third_party/libgav1/src/src/tile.h @@ -33,7 +33,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/frame_scratch_buffer.h" -#include "src/loop_filter_mask.h" #include "src/loop_restoration_info.h" #include "src/obu_parser.h" #include "src/post_filter.h" @@ -77,16 +76,14 @@ class Tile : public Allocable { const WedgeMaskArray& wedge_masks, SymbolDecoderContext* const saved_symbol_decoder_context, const SegmentationMap* prev_segment_ids, PostFilter* const post_filter, - BlockParametersHolder* const block_parameters_holder, const dsp::Dsp* const dsp, ThreadPool* const thread_pool, BlockingCounterWithStatus* const pending_tiles, bool frame_parallel, bool use_intra_prediction_buffer) { std::unique_ptr<Tile> tile(new (std::nothrow) Tile( tile_number, data, size, sequence_header, frame_header, current_frame, state, frame_scratch_buffer, wedge_masks, saved_symbol_decoder_context, - prev_segment_ids, post_filter, block_parameters_holder, dsp, - thread_pool, pending_tiles, frame_parallel, - use_intra_prediction_buffer)); + prev_segment_ids, post_filter, dsp, thread_pool, pending_tiles, + frame_parallel, use_intra_prediction_buffer)); return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr; } @@ -100,9 +97,17 @@ class Tile : public Allocable { // Parses the entire tile. bool Parse(); + // Decodes the entire tile. |superblock_row_progress| and + // |superblock_row_progress_condvar| are arrays of size equal to the number of + // superblock rows in the frame. Increments |superblock_row_progress[i]| after + // each superblock row at index |i| is decoded. If the count reaches the + // number of tile columns, then it notifies + // |superblock_row_progress_condvar[i]|. + bool Decode(std::mutex* mutex, int* superblock_row_progress, + std::condition_variable* superblock_row_progress_condvar); // Parses and decodes the entire tile. Depending on the configuration of this // Tile, this function may do multithreaded decoding. - bool ParseAndDecode(bool is_main_thread); // 5.11.2. + bool ParseAndDecode(); // 5.11.2. // Processes all the columns of the superblock row at |row4x4| that are within // this Tile. If |save_symbol_decoder_context| is true, then // SaveSymbolDecoderContext() is invoked for the last superblock row. @@ -118,10 +123,14 @@ class Tile : public Allocable { return reference_frame_sign_bias_; } + bool IsRow4x4Inside(int row4x4) const { + return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_; + } + // 5.11.51. 
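
The Decode() contract documented above is a per-row fan-in: every tile column increments the shared counter for a superblock row, and the matching condition variable fires once all columns have contributed, letting downstream stages wait on exactly the rows they need. A self-contained sketch of that pattern with illustrative names; |progress| and |row_condvar| are arrays with one entry per superblock row, as in Decode():

#include <condition_variable>
#include <mutex>

// Called by a tile after decoding superblock row |row| in its column.
void MarkSuperBlockRowDecoded(int row, int tile_columns, std::mutex* mutex,
                              int* progress,
                              std::condition_variable* row_condvar) {
  std::lock_guard<std::mutex> lock(*mutex);
  if (++progress[row] == tile_columns) row_condvar[row].notify_all();
}

// Called by a consumer (e.g. a later pipeline stage) before touching |row|.
void WaitForSuperBlockRow(int row, int tile_columns, std::mutex* mutex,
                          int* progress,
                          std::condition_variable* row_condvar) {
  std::unique_lock<std::mutex> lock(*mutex);
  row_condvar[row].wait(lock,
                        [&] { return progress[row] >= tile_columns; });
}
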
bool IsInside(int row4x4, int column4x4) const { - return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_ && - column4x4 >= column4x4_start_ && column4x4 < column4x4_end_; + return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ && + column4x4 < column4x4_end_; } bool IsLeftInside(int column4x4) const { @@ -168,9 +177,13 @@ class Tile : public Allocable { const BlockParameters& Parameters(int row, int column) const { return *block_parameters_holder_.Find(row, column); } + int number() const { return number_; } int superblock_rows() const { return superblock_rows_; } int superblock_columns() const { return superblock_columns_; } + int row4x4_start() const { return row4x4_start_; } + int column4x4_start() const { return column4x4_start_; } + int column4x4_end() const { return column4x4_end_; } private: Tile(int tile_number, const uint8_t* data, size_t size, @@ -180,9 +193,9 @@ class Tile : public Allocable { const WedgeMaskArray& wedge_masks, SymbolDecoderContext* saved_symbol_decoder_context, const SegmentationMap* prev_segment_ids, PostFilter* post_filter, - BlockParametersHolder* block_parameters_holder, const dsp::Dsp* dsp, - ThreadPool* thread_pool, BlockingCounterWithStatus* pending_tiles, - bool frame_parallel, bool use_intra_prediction_buffer); + const dsp::Dsp* dsp, ThreadPool* thread_pool, + BlockingCounterWithStatus* pending_tiles, bool frame_parallel, + bool use_intra_prediction_buffer); // Stores the transform tree state when reading variable size transform trees // and when applying the transform tree. When applying the transform tree, @@ -201,16 +214,20 @@ class Tile : public Allocable { int depth; }; + // Enum to track the processing state of a superblock. + enum SuperBlockState : uint8_t { + kSuperBlockStateNone, // Not yet parsed or decoded. + kSuperBlockStateParsed, // Parsed but not yet decoded. + kSuperBlockStateScheduled, // Scheduled for decoding. + kSuperBlockStateDecoded // Parsed and decoded. + }; + // Parameters used to facilitate multi-threading within the Tile. struct ThreadingParameters { std::mutex mutex; - // Array2DView of size |superblock_rows_| by |superblock_columns_| - // containing the processing state of each superblock. The code in this - // class uses relative indexing of superblocks with respect to this Tile. - // The memory for this comes from the caller (the |super_block_state| - // parameter in the constructor). The memory is for the whole frame whereas - // the |sb_state| array in this struct points to the beginning of this Tile. - Array2DView<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex); + // 2d array of size |superblock_rows_| by |superblock_columns_| containing + // the processing state of each superblock. + Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex); // Variable used to indicate either parse or decode failure. bool abort LIBGAV1_GUARDED_BY(mutex) = false; int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0; @@ -297,14 +314,6 @@ class Tile : public Allocable { void ResetLoopRestorationParams(); void ReadLoopRestorationCoefficients(int row4x4, int column4x4, BlockSize block_size); // 5.11.57. - // Build bit masks for vertical edges followed by horizontal edges. - // Traverse through each transform edge in the current coding block, and - // determine if a 4x4 edge needs filtering. If filtering is needed, determine - // filter length. Set corresponding bit mask to 1. 
- void BuildBitMask(const Block& block); - void BuildBitMaskHelper(const Block& block, int row4x4, int column4x4, - BlockSize block_size, bool is_vertical_block_border, - bool is_horizontal_block_border); // Helper functions for DecodeBlock. bool ReadSegmentId(const Block& block); // 5.11.9. @@ -582,8 +591,8 @@ class Tile : public Allocable { } const int number_; - int row_; - int column_; + const int row_; + const int column_; const uint8_t* const data_; size_t size_; int row4x4_start_; @@ -729,14 +738,17 @@ class Tile : public Allocable { int8_t delta_lf_[kFrameLfCount]; // True if all the values in |delta_lf_| are zero. False otherwise. bool delta_lf_all_zero_; - bool build_bit_mask_when_parsing_; const bool frame_parallel_; const bool use_intra_prediction_buffer_; // Buffer used to store the unfiltered pixels that are necessary for decoding // the next superblock row (for the intra prediction process). Used only if - // |use_intra_prediction_buffer_| is true. - std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes> - intra_prediction_buffer_; + // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains + // one row buffer for each tile row. This tile will have to use the buffer + // corresponding to this tile's row. + IntraPredictionBuffer* const intra_prediction_buffer_; + // Stores the progress of the reference frames. This will be used to avoid + // unnecessary calls into RefCountedBuffer::WaitUntil(). + std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_; }; struct Tile::Block { diff --git a/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc b/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc index c13fbe3b907..1bae5a3c1b6 100644 --- a/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc +++ b/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc @@ -1100,12 +1100,11 @@ uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) { uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) { const BlockParameters& bp = *block.bp; - const int forward = std::abs(GetRelativeDistance( - current_frame_.order_hint(bp.reference_frame[0]), - frame_header_.order_hint, sequence_header_.order_hint_shift_bits)); - const int backward = std::abs(GetRelativeDistance( - current_frame_.order_hint(bp.reference_frame[1]), - frame_header_.order_hint, sequence_header_.order_hint_shift_bits)); + const ReferenceInfo& reference_info = *current_frame_.reference_info(); + const int forward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]); + const int backward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]); int context = (forward == backward) ? 3 : 0; if (block.top_available[kPlaneY]) { if (!block.IsTopSingle()) { diff --git a/chromium/third_party/libgav1/src/src/tile/prediction.cc b/chromium/third_party/libgav1/src/src/tile/prediction.cc index 672b5a2b3a7..785c1dac404 100644 --- a/chromium/third_party/libgav1/src/src/tile/prediction.cc +++ b/chromium/third_party/libgav1/src/src/tile/prediction.cc @@ -277,7 +277,6 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, (mode == kPredictionModeDc && has_left); const Pixel* top_row_src = buffer[y - 1]; - int top_row_offset = 0; // Determine if we need to retrieve the top row from // |intra_prediction_buffer_|. 
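The hunks above and below drop |top_row_offset| because each tile row now shares a single frame-width intra prediction row buffer: writers copy their pixels at the tile's column offset (see the memcpy change in PopulateIntraPredictionBuffer further down), so readers can index the buffer with the frame-absolute |x|. A minimal sketch of that convention follows; it is illustrative only, and PlaneRowBuffer, CopyTopRow, and TopRowAt are hypothetical names, not libgav1 APIs.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical frame-width row buffer shared by every tile in one tile row.
struct PlaneRowBuffer {
  std::vector<uint8_t> pixels;  // one pixel row spanning the full frame width
};

// Writer side (after a superblock row is decoded): copy this tile's pixels at
// the tile's own column offset so the buffer stays frame-indexed.
inline void CopyTopRow(PlaneRowBuffer* buffer, const uint8_t* tile_pixels,
                       size_t column_start, size_t width) {
  std::memcpy(buffer->pixels.data() + column_start, tile_pixels, width);
}

// Reader side (intra prediction of the next superblock row): index with the
// frame-absolute x; no tile-relative offset is needed.
inline const uint8_t* TopRowAt(const PlaneRowBuffer& buffer, size_t x) {
  return buffer.pixels.data() + x;
}

This is the same idea that lets the write side below add column_start * pixel_size to the memcpy destination while the reads in IntraPrediction use top_row_src[x] directly.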
@@ -295,13 +294,8 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, // then we will have to retrieve the top row from the // |intra_prediction_buffer_|. if (current_superblock_index != top_row_superblock_index) { - top_row_src = - reinterpret_cast<const Pixel*>(intra_prediction_buffer_[plane].get()); - // The |intra_prediction_buffer_| only stores the top row for this Tile. - // The |x| value in this function is absolute to the frame. So in order to - // make it relative to this Tile, all acccesses into top_row_src must be - // offset by negative |top_row_offset|. - top_row_offset = MultiplyBy4(column4x4_start_) >> subsampling_x_[plane]; + top_row_src = reinterpret_cast<const Pixel*>( + (*intra_prediction_buffer_)[plane].get()); } } @@ -309,8 +303,7 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, // Compute top_row. if (has_top || has_left) { const int left_index = has_left ? x - 1 : x; - top_row[-1] = has_top ? top_row_src[left_index - top_row_offset] - : buffer[y][left_index]; + top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index]; } else { top_row[-1] = 1 << (bitdepth - 1); } @@ -320,14 +313,12 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size); } else { const int top_limit = std::min(max_x - x + 1, top_right_size); - memcpy(top_row, &top_row_src[x - top_row_offset], - top_limit * sizeof(Pixel)); + memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel)); // Even though it is safe to call Memset with a size of 0, accessing // top_row_src[top_limit - x + 1] is not allowed when this condition is // false. if (top_size - top_limit > 0) { - Memset(top_row + top_limit, - top_row_src[top_limit + x - 1 - top_row_offset], + Memset(top_row + top_limit, top_row_src[top_limit + x - 1], top_size - top_limit); } } @@ -336,13 +327,13 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, // Compute left_column. if (has_top || has_left) { const int left_index = has_left ? x - 1 : x; - left_column[-1] = has_top ? top_row_src[left_index - top_row_offset] - : buffer[y][left_index]; + left_column[-1] = + has_top ? top_row_src[left_index] : buffer[y][left_index]; } else { left_column[-1] = 1 << (bitdepth - 1); } if (!has_left && has_top) { - Memset(left_column, top_row_src[x - top_row_offset], left_size); + Memset(left_column, top_row_src[x], left_size); } else if (!has_left && !has_top) { Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size); } else { @@ -942,14 +933,13 @@ void Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1, for (int reference = 0; reference < 2; ++reference) { const BlockParameters& bp = *block_parameters_holder_.Find(candidate_row, candidate_column); - const unsigned int reference_hint = - current_frame_.order_hint(bp.reference_frame[reference]); // Note: distance[0] and distance[1] correspond to relative distance // between current frame and reference frame [1] and [0], respectively. 
- distance[1 - reference] = Clip3( - std::abs(GetRelativeDistance(reference_hint, frame_header_.order_hint, - sequence_header_.order_hint_shift_bits)), - 0, kMaxFrameDistance); + distance[1 - reference] = std::min( + std::abs(static_cast<int>( + current_frame_.reference_info() + ->relative_distance_from[bp.reference_frame[reference]])), + static_cast<int>(kMaxFrameDistance)); } GetDistanceWeights(distance, weight); @@ -1136,7 +1126,11 @@ bool Tile::BlockInterPrediction( // reference_y_max by 2 since we only track the progress of Y planes. reference_y_max = LeftShift(reference_y_max, subsampling_y); } - if (!reference_frames_[reference_frame_index]->WaitUntil(reference_y_max)) { + if (reference_frame_progress_cache_[reference_frame_index] < + reference_y_max && + !reference_frames_[reference_frame_index]->WaitUntil( + reference_y_max, + &reference_frame_progress_cache_[reference_frame_index])) { return false; } } @@ -1275,7 +1269,11 @@ bool Tile::BlockWarpProcess(const Block& block, const Plane plane, // For U and V planes with subsampling, we need to multiply reference_y_max // by 2 since we only track the progress of Y planes. reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]); - if (!reference_frames_[reference_frame_index]->WaitUntil(reference_y_max)) { + if (reference_frame_progress_cache_[reference_frame_index] < + reference_y_max && + !reference_frames_[reference_frame_index]->WaitUntil( + reference_y_max, + &reference_frame_progress_cache_[reference_frame_index])) { return false; } } diff --git a/chromium/third_party/libgav1/src/src/tile/tile.cc b/chromium/third_party/libgav1/src/src/tile/tile.cc index 50daf1add34..ed00e282018 100644 --- a/chromium/third_party/libgav1/src/src/tile/tile.cc +++ b/chromium/third_party/libgav1/src/src/tile/tile.cc @@ -17,6 +17,7 @@ #include <algorithm> #include <array> #include <cassert> +#include <climits> #include <cstdlib> #include <cstring> #include <memory> @@ -100,6 +101,14 @@ constexpr PredictionMode kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal, kPredictionModeD157, kPredictionModeDc}; +// Mask used to determine the index for mode_deltas lookup. +constexpr BitMaskSet kPredictionModeDeltasMask( + kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv, + kPredictionModeNearestNearestMv, kPredictionModeNearNearMv, + kPredictionModeNearestNewMv, kPredictionModeNewNearestMv, + kPredictionModeNearNewMv, kPredictionModeNewNearMv, + kPredictionModeNewNewMv); + // This is computed as: // min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4. 
constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = { @@ -383,12 +392,13 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, const WedgeMaskArray& wedge_masks, SymbolDecoderContext* const saved_symbol_decoder_context, const SegmentationMap* prev_segment_ids, - PostFilter* const post_filter, - BlockParametersHolder* const block_parameters_holder, - const dsp::Dsp* const dsp, ThreadPool* const thread_pool, + PostFilter* const post_filter, const dsp::Dsp* const dsp, + ThreadPool* const thread_pool, BlockingCounterWithStatus* const pending_tiles, bool frame_parallel, bool use_intra_prediction_buffer) : number_(tile_number), + row_(number_ / frame_header.tile_info.tile_columns), + column_(number_ % frame_header.tile_info.tile_columns), data_(data), size_(size), read_deltas_(false), @@ -410,7 +420,7 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, prev_segment_ids_(prev_segment_ids), dsp_(*dsp), post_filter_(*post_filter), - block_parameters_holder_(*block_parameters_holder), + block_parameters_holder_(frame_scratch_buffer->block_parameters_holder), quantizer_(sequence_header_.color_config.bitdepth, &frame_header_.quantizer), residual_size_((sequence_header_.color_config.bitdepth == 8) @@ -428,11 +438,12 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, tile_scratch_buffer_pool_( &frame_scratch_buffer->tile_scratch_buffer_pool), pending_tiles_(pending_tiles), - build_bit_mask_when_parsing_(false), frame_parallel_(frame_parallel), - use_intra_prediction_buffer_(use_intra_prediction_buffer) { - row_ = number_ / frame_header.tile_info.tile_columns; - column_ = number_ % frame_header.tile_info.tile_columns; + use_intra_prediction_buffer_(use_intra_prediction_buffer), + intra_prediction_buffer_( + use_intra_prediction_buffer_ + ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_] + : nullptr) { row4x4_start_ = frame_header.tile_info.tile_row_start[row_]; row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1]; column4x4_start_ = frame_header.tile_info.tile_column_start[column_]; @@ -454,6 +465,9 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, split_parse_and_decode_ = (thread_pool_ != nullptr && superblock_columns_ > intra_block_copy_lag_) || frame_parallel; + if (frame_parallel_) { + reference_frame_progress_cache_.fill(INT_MIN); + } memset(delta_lf_, 0, sizeof(delta_lf_)); delta_lf_all_zero_ = true; const YuvBuffer& buffer = post_filter_.frame_buffer(); @@ -491,21 +505,6 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3) << subsampling_x_[plane]); } - auto& superblock_state = frame_scratch_buffer->superblock_state; - if (split_parse_and_decode_ && superblock_state.rows() > 0) { - // The |superblock_state| array is for the entire frame. Set - // |threading_.sb_state| to point to the beginning of this Tile. 
- std::lock_guard<std::mutex> lock(threading_.mutex); - const int superblock_width_log2 = - FloorLog2(kBlockWidthPixels[SuperBlockSize()]); - const int superblock_row_start_index = - MultiplyBy4(row4x4_start_) >> superblock_width_log2; - const int superblock_column_start_index = - MultiplyBy4(column4x4_start_) >> superblock_width_log2; - threading_.sb_state.Reset(superblock_rows_, superblock_state.columns(), - &superblock_state[superblock_row_start_index] - [superblock_column_start_index]); - } } bool Tile::Init() { @@ -545,28 +544,11 @@ bool Tile::Init() { return false; } } - if (use_intra_prediction_buffer_) { - for (int plane = 0; plane < PlaneCount(); ++plane) { - const size_t intra_prediction_buffer_size = - (MultiplyBy4(column4x4_end_ - column4x4_start_) >> - subsampling_x_[plane]) * - (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t) - : sizeof(uint16_t)); - if (!intra_prediction_buffer_[plane].Resize( - intra_prediction_buffer_size)) { - LIBGAV1_DLOG( - ERROR, "Failed to allocate intra prediction buffer for plane %d.\n", - plane); - return false; - } - } - } if (frame_header_.use_ref_frame_mvs) { assert(sequence_header_.enable_order_hint); SetupMotionField(frame_header_, current_frame_, reference_frames_, - sequence_header_.order_hint_shift_bits, row4x4_start_, - row4x4_end_, column4x4_start_, column4x4_end_, - &motion_field_); + row4x4_start_, row4x4_end_, column4x4_start_, + column4x4_end_, &motion_field_); } ResetLoopRestorationParams(); return true; @@ -612,11 +594,10 @@ void Tile::SaveSymbolDecoderContext() { } } -bool Tile::ParseAndDecode(bool is_main_thread) { +bool Tile::ParseAndDecode() { // If this is the main thread, we build the loop filter bit masks when parsing // so that it happens in the current thread. This ensures that the main thread // does as much work as possible. - build_bit_mask_when_parsing_ = is_main_thread; if (split_parse_and_decode_) { if (!ThreadedParseAndDecode()) return false; SaveSymbolDecoderContext(); @@ -663,9 +644,72 @@ bool Tile::Parse() { return true; } +bool Tile::Decode( + std::mutex* const mutex, int* const superblock_row_progress, + std::condition_variable* const superblock_row_progress_condvar) { + const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16; + const int block_width4x4_log2 = + sequence_header_.use_128x128_superblock ? 5 : 4; + std::unique_ptr<TileScratchBuffer> scratch_buffer = + tile_scratch_buffer_pool_->Get(); + if (scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer."); + return false; + } + for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2; + row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) { + if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>( + row4x4, scratch_buffer.get())) { + return false; + } + if (post_filter_.DoDeblock()) { + // Apply vertical deblock filtering for all the columns in this tile + // except for the first 64 columns. + post_filter_.ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, + column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_, + block_width4x4); + // If this is the first superblock row of the tile, then we cannot apply + // horizontal deblocking here since we don't know if the top row is + // available. So it will be done by the calling thread in that case. + if (row4x4 != row4x4_start_) { + // Apply horizontal deblock filtering for all the columns in this tile + // except for the first and the last 64 columns. 
+ // Note about the last tile of each row: For the last tile, + // column4x4_end may not be a multiple of 16. In that case it is still + // okay to simply subtract 16 since ApplyDeblockFilter() will only do + // the filters in increments of 64 columns (or 32 columns for chroma + // with subsampling). + post_filter_.ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, + column4x4_start_ + kNum4x4InLoopFilterUnit, + column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4); + } + } + bool notify; + { + std::unique_lock<std::mutex> lock(*mutex); + notify = ++superblock_row_progress[index] == + frame_header_.tile_info.tile_columns; + } + if (notify) { + // We are done decoding this superblock row. Notify the post filtering + // thread. + superblock_row_progress_condvar[index].notify_one(); + } + } + tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); + return true; +} + bool Tile::ThreadedParseAndDecode() { { std::lock_guard<std::mutex> lock(threading_.mutex); + if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) { + pending_tiles_->Decrement(false); + LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed."); + return false; + } // Account for the parsing job. ++threading_.pending_jobs; } @@ -826,14 +870,16 @@ void Tile::PopulateIntraPredictionBuffer(int row4x4) { if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) { return; } + const size_t pixel_size = + (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t) + : sizeof(uint16_t)); for (int plane = 0; plane < PlaneCount(); ++plane) { const int row_to_copy = (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1; const size_t pixels_to_copy = (MultiplyBy4(column4x4_end_ - column4x4_start_) >> subsampling_x_[plane]) * - (sequence_header_.color_config.bitdepth == 8 ? 
sizeof(uint8_t) - : sizeof(uint16_t)); + pixel_size; const size_t column_start = MultiplyBy4(column4x4_start_) >> subsampling_x_[plane]; void* start; @@ -848,7 +894,8 @@ void Tile::PopulateIntraPredictionBuffer(int row4x4) { { start = &buffer_[plane][row_to_copy][column_start]; } - memcpy(intra_prediction_buffer_[plane].get(), start, pixels_to_copy); + memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size, + start, pixels_to_copy); } } @@ -2067,15 +2114,16 @@ bool Tile::ComputePrediction(const Block& block) { void Tile::PopulateDeblockFilterLevel(const Block& block) { if (!post_filter_.DoDeblock()) return; BlockParameters& bp = *block.bp; + const int mode_id = + static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode)); for (int i = 0; i < kFrameLfCount; ++i) { if (delta_lf_all_zero_) { bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel( - bp.segment_id, i, bp.reference_frame[0], - LoopFilterMask::GetModeId(bp.y_mode)); + bp.segment_id, i, bp.reference_frame[0], mode_id); } else { bp.deblock_filter_level[i] = deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]] - [LoopFilterMask::GetModeId(bp.y_mode)]; + [mode_id]; } } } @@ -2138,10 +2186,6 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit, y_limit, bp.segment_id); } - if (kDeblockFilterBitMask && - (build_bit_mask_when_parsing_ || !split_parse_and_decode_)) { - BuildBitMask(block); - } StoreMotionFieldMvsIntoCurrentFrame(block); if (!split_parse_and_decode_) { prediction_parameters_ = std::move(bp.prediction_parameters); @@ -2164,9 +2208,6 @@ bool Tile::DecodeBlock(ParameterTree* const tree, !Residual(block, kProcessingModeDecodeOnly)) { return false; } - if (kDeblockFilterBitMask && !build_bit_mask_when_parsing_) { - BuildBitMask(block); - } block.bp->prediction_parameters.reset(nullptr); return true; } @@ -2451,176 +2492,11 @@ void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4, } } -void Tile::BuildBitMask(const Block& block) { - if (!post_filter_.DoDeblock()) return; - if (block.size <= kBlock64x64) { - BuildBitMaskHelper(block, block.row4x4, block.column4x4, block.size, true, - true); - } else { - const int block_width4x4 = kNum4x4BlocksWide[block.size]; - const int block_height4x4 = kNum4x4BlocksHigh[block.size]; - for (int y = 0; y < block_height4x4; y += 16) { - for (int x = 0; x < block_width4x4; x += 16) { - BuildBitMaskHelper(block, block.row4x4 + y, block.column4x4 + x, - kBlock64x64, x == 0, y == 0); - } - } - } -} - -void Tile::BuildBitMaskHelper(const Block& block, int row4x4, int column4x4, - BlockSize block_size, - const bool is_vertical_block_border, - const bool is_horizontal_block_border) { - const int block_width4x4 = kNum4x4BlocksWide[block_size]; - const int block_height4x4 = kNum4x4BlocksHigh[block_size]; - BlockParameters& bp = *block.bp; - const bool skip = bp.skip && bp.is_inter; - LoopFilterMask* const masks = post_filter_.masks(); - const int unit_id = DivideBy16(row4x4) * masks->num_64x64_blocks_per_row() + - DivideBy16(column4x4); - - for (int plane = kPlaneY; plane < PlaneCount(); ++plane) { - // For U and V planes, do not build bit masks if level == 0. - if (plane > kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) { - continue; - } - // Build bit mask for vertical edges. 
- const int subsampling_x = subsampling_x_[plane]; - const int subsampling_y = subsampling_y_[plane]; - const int column_limit = - std::min(column4x4 + block_width4x4, deblock_column_limit_[plane]); - const int row_limit = - std::min(row4x4 + block_height4x4, deblock_row_limit_[plane]); - const int row_start = GetDeblockPosition(row4x4, subsampling_y); - const int column_start = GetDeblockPosition(column4x4, subsampling_x); - if (row_start >= row_limit || column_start >= column_limit) { - continue; - } - const int vertical_step = 1 << subsampling_y; - const int horizontal_step = 1 << subsampling_x; - const BlockParameters& bp = - *block_parameters_holder_.Find(row_start, column_start); - const int horizontal_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal]; - const int vertical_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical]; - const uint8_t vertical_level = - bp.deblock_filter_level[vertical_level_index]; - - for (int row = row_start; row < row_limit; row += vertical_step) { - for (int column = column_start; column < column_limit;) { - const TransformSize tx_size = (plane == kPlaneY) - ? inter_transform_sizes_[row][column] - : bp.uv_transform_size; - // (1). Don't filter frame boundary. - // (2). For tile boundary, we don't know whether the previous tile is - // available or not, thus we handle it after all tiles are decoded. - const bool is_vertical_border = - (column == column_start) && is_vertical_block_border; - if (column == GetDeblockPosition(column4x4_start_, subsampling_x) || - (skip && !is_vertical_border)) { - column += kNum4x4BlocksWide[tx_size] << subsampling_x; - continue; - } - - // bp_left is the parameter of the left prediction block which - // is guaranteed to be inside the tile. - const BlockParameters& bp_left = - *block_parameters_holder_.Find(row, column - horizontal_step); - const uint8_t left_level = - is_vertical_border - ? bp_left.deblock_filter_level[vertical_level_index] - : vertical_level; - // We don't have to check if the left block is skipped or not, - // because if the current transform block is on the edge of the coding - // block, is_vertical_border is true; if it's not on the edge, - // left skip is equal to skip. - if (vertical_level != 0 || left_level != 0) { - const TransformSize left_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes_[row][column - horizontal_step] - : bp_left.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - GetTransformSizeIdWidth(tx_size, left_tx_size); - const int r = row & (kNum4x4InLoopFilterMaskUnit - 1); - const int c = column & (kNum4x4InLoopFilterMaskUnit - 1); - const int shift = LoopFilterMask::GetShift(r, c); - const int index = LoopFilterMask::GetIndex(r); - const auto mask = static_cast<uint64_t>(1) << shift; - masks->SetLeft(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (vertical_level == 0) ? left_level : vertical_level; - masks->SetLevel(current_level, unit_id, plane, - kLoopFilterTypeVertical, - LoopFilterMask::GetLevelOffset(r, c)); - } - column += kNum4x4BlocksWide[tx_size] << subsampling_x; - } - } - - // Build bit mask for horizontal edges. - const uint8_t horizontal_level = - bp.deblock_filter_level[horizontal_level_index]; - for (int column = column_start; column < column_limit; - column += horizontal_step) { - for (int row = row_start; row < row_limit;) { - const TransformSize tx_size = (plane == kPlaneY) - ? inter_transform_sizes_[row][column] - : bp.uv_transform_size; - - // (1). 
Don't filter frame boundary. - // (2). For tile boundary, we don't know whether the previous tile is - // available or not, thus we handle it after all tiles are decoded. - const bool is_horizontal_border = - (row == row_start) && is_horizontal_block_border; - if (row == GetDeblockPosition(row4x4_start_, subsampling_y) || - (skip && !is_horizontal_border)) { - row += kNum4x4BlocksHigh[tx_size] << subsampling_y; - continue; - } - - // bp_top is the parameter of the top prediction block which is - // guaranteed to be inside the tile. - const BlockParameters& bp_top = - *block_parameters_holder_.Find(row - vertical_step, column); - const uint8_t top_level = - is_horizontal_border - ? bp_top.deblock_filter_level[horizontal_level_index] - : horizontal_level; - // We don't have to check it the top block is skipped or not, - // because if the current transform block is on the edge of the coding - // block, is_horizontal_border is true; if it's not on the edge, - // top skip is equal to skip. - if (horizontal_level != 0 || top_level != 0) { - const TransformSize top_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes_[row - vertical_step][column] - : bp_top.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - static_cast<LoopFilterTransformSizeId>( - std::min({kTransformHeightLog2[tx_size] - 2, - kTransformHeightLog2[top_tx_size] - 2, 2})); - const int r = row & (kNum4x4InLoopFilterMaskUnit - 1); - const int c = column & (kNum4x4InLoopFilterMaskUnit - 1); - const int shift = LoopFilterMask::GetShift(r, c); - const int index = LoopFilterMask::GetIndex(r); - const auto mask = static_cast<uint64_t>(1) << shift; - masks->SetTop(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (horizontal_level == 0) ? top_level : horizontal_level; - masks->SetLevel(current_level, unit_id, plane, - kLoopFilterTypeHorizontal, - LoopFilterMask::GetLevelOffset(r, c)); - } - row += kNum4x4BlocksHigh[tx_size] << subsampling_y; - } - } - } -} - void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { - if (frame_header_.refresh_frame_flags == 0) return; + if (frame_header_.refresh_frame_flags == 0 || + IsIntraFrame(frame_header_.frame_type)) { + return; + } // Iterate over odd rows/columns beginning at the first odd row/column for the // block. It is done this way because motion field mvs are only needed at a // 8x8 granularity. @@ -2636,6 +2512,7 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { // The largest reference MV component that can be saved. 
constexpr int kRefMvsLimit = (1 << 12) - 1; const BlockParameters& bp = *block.bp; + ReferenceInfo* reference_info = current_frame_.reference_info(); for (int i = 1; i >= 0; --i) { const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i]; // Must make a local copy so that StoreMotionFieldMvs() knows there is no @@ -2649,12 +2526,7 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { // The next line is equivalent to: // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit (mv_row | mv_column) <= kRefMvsLimit && - GetRelativeDistance( - reference_order_hint_ - [frame_header_.reference_frame_index[reference_frame_to_store - - kReferenceFrameLast]], - frame_header_.order_hint, - sequence_header_.order_hint_shift_bits) < 0) { + reference_info->relative_distance_from[reference_frame_to_store] < 0) { const int row_start8x8 = DivideBy2(row_start4x4); const int row_limit8x8 = DivideBy2(row_limit4x4); const int column_start8x8 = DivideBy2(column_start4x4); @@ -2663,10 +2535,10 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { const int columns = column_limit8x8 - column_start8x8; const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4()); ReferenceFrameType* const reference_frame_row_start = - current_frame_.motion_field_reference_frame(row_start8x8, - column_start8x8); + &reference_info + ->motion_field_reference_frame[row_start8x8][column_start8x8]; MotionVector* const mv = - current_frame_.motion_field_mv(row_start8x8, column_start8x8); + &reference_info->motion_field_mv[row_start8x8][column_start8x8]; // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined // and simplifies std::fill() for these cases. diff --git a/chromium/third_party/libgav1/src/src/utils/array_2d.h b/chromium/third_party/libgav1/src/src/utils/array_2d.h index 941d4b16f87..2df624187d0 100644 --- a/chromium/third_party/libgav1/src/src/utils/array_2d.h +++ b/chromium/third_party/libgav1/src/src/utils/array_2d.h @@ -113,6 +113,7 @@ class Array2D { int columns() const { return data_view_.columns(); } size_t size() const { return size_; } T* data() { return data_.get(); } + const T* data() const { return data_.get(); } T* operator[](int row) { return data_view_[row]; } diff --git a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc index b52e91d6c97..79bb2b8f7e1 100644 --- a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc +++ b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc @@ -35,13 +35,11 @@ int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) { } // namespace -BlockParametersHolder::BlockParametersHolder(int rows4x4, int columns4x4, - bool use_128x128_superblock) - : rows4x4_(rows4x4), - columns4x4_(columns4x4), - use_128x128_superblock_(use_128x128_superblock) {} - -bool BlockParametersHolder::Init() { +bool BlockParametersHolder::Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + use_128x128_superblock_ = use_128x128_superblock; if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) { LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed."); return false; diff --git a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h index 909de5eefa3..35543c30a4e 100644 --- 
a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h +++ b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h @@ -31,17 +31,16 @@ namespace libgav1 { // corresponding to a superblock. class BlockParametersHolder { public: - // If |use_128x128_superblock| is true, 128x128 superblocks will be used, - // otherwise 64x64 superblocks will be used. - BlockParametersHolder(int rows4x4, int columns4x4, - bool use_128x128_superblock); + BlockParametersHolder() = default; // Not copyable or movable. BlockParametersHolder(const BlockParametersHolder&) = delete; BlockParametersHolder& operator=(const BlockParametersHolder&) = delete; - // Must be called first. - LIBGAV1_MUST_USE_RESULT bool Init(); + // If |use_128x128_superblock| is true, 128x128 superblocks will be used, + // otherwise 64x64 superblocks will be used. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock); // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This // is done as a simple look up of the |block_parameters_cache_| matrix. @@ -54,6 +53,10 @@ class BlockParametersHolder { return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; } + BlockParameters* const* Address(int row4x4, int column4x4) const { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + int columns4x4() const { return columns4x4_; } // Returns the ParameterTree corresponding to superblock starting at (|row|, @@ -66,9 +69,9 @@ class BlockParametersHolder { BlockParameters* bp); private: - const int rows4x4_; - const int columns4x4_; - const bool use_128x128_superblock_; + int rows4x4_ = 0; + int columns4x4_ = 0; + bool use_128x128_superblock_ = false; Array2D<std::unique_ptr<ParameterTree>> trees_; // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by diff --git a/chromium/third_party/libgav1/src/src/utils/common.h b/chromium/third_party/libgav1/src/src/utils/common.h index 56f413a2849..d6e019933e2 100644 --- a/chromium/third_party/libgav1/src/src/utils/common.h +++ b/chromium/third_party/libgav1/src/src/utils/common.h @@ -400,19 +400,17 @@ constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; } // 7.9.3. (without the clamp for numerator and denominator). inline void GetMvProjection(const MotionVector& mv, int numerator, - int denominator, MotionVector* projection_mv) { - // Allow numerator and denominator to be 0 so that this function can be called - // unconditionally. When either numerator or denominator is 0, |projection_mv| - // will be 0, and this is what we want. + int division_multiplier, + MotionVector* projection_mv) { + // Allow numerator to be 0 so that this function can be called + // unconditionally. When numerator is 0, |projection_mv| will be 0, and this + // is what we want. 
assert(std::abs(numerator) <= kMaxFrameDistance); - assert(denominator >= 0); - assert(denominator <= kMaxFrameDistance); for (int i = 0; i < 2; ++i) { - projection_mv->mv[i] = Clip3( - RightShiftWithRoundingSigned( - mv.mv[i] * numerator * kProjectionMvDivisionLookup[denominator], - 14), - -kProjectionMvClamp, kProjectionMvClamp); + projection_mv->mv[i] = + Clip3(RightShiftWithRoundingSigned( + mv.mv[i] * numerator * division_multiplier, 14), + -kProjectionMvClamp, kProjectionMvClamp); } } diff --git a/chromium/third_party/libgav1/src/src/utils/constants.h b/chromium/third_party/libgav1/src/src/utils/constants.h index 868bfdc8c82..f070767ecb6 100644 --- a/chromium/third_party/libgav1/src/src/utils/constants.h +++ b/chromium/third_party/libgav1/src/src/utils/constants.h @@ -27,11 +27,14 @@ namespace libgav1 { // Returns the number of elements between begin (inclusive) and end (inclusive). constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; } -#if defined(ENABLE_DEBLOCK_BIT_MASK) -constexpr bool kDeblockFilterBitMask = true; +enum { +// Maximum number of threads that the library will ever create. +#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0 + kMaxThreads = LIBGAV1_MAX_THREADS #else -constexpr bool kDeblockFilterBitMask = false; -#endif // defined(ENABLE_DEBLOCK_BIT_MASK) + kMaxThreads = 128 +#endif +}; // anonymous enum enum { kInvalidMvValue = -32768, @@ -44,7 +47,6 @@ enum { kFrameLfCount = 4, kMaxLoopFilterValue = 63, kNum4x4In64x64 = 256, - kNumLoopFilterMasks = 4, kMaxAngleDelta = 3, kDirectionalIntraModes = 8, kMaxSuperBlockSizeLog2 = 7, @@ -97,24 +99,19 @@ enum { kMaxSuperBlockSizeInPixels = 128, kMaxScaledSuperBlockSizeInPixels = 128 * 2, kMaxSuperBlockSizeSquareInPixels = 128 * 128, - kNum4x4InLoopFilterMaskUnit = 16, + kNum4x4InLoopFilterUnit = 16, kProjectionMvClamp = (1 << 14) - 1, // == 16383 kProjectionMvMaxHorizontalOffset = 8, + kCdefUnitSize = 64, + kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kRestorationBorder, kRestorationUnitOffset = 8, - // 2 pixel padding for 5x5 box sum on each side. - kRestorationPadding = 4, // Loop restoration's processing unit size is fixed as 64x64. - kRestorationProcessingUnitSize = 64, - kRestorationProcessingUnitSizeWithBorders = - kRestorationProcessingUnitSize + 2 * kRestorationBorder, - // The max size of a box filter process output buffer. - kMaxBoxFilterProcessOutputPixels = kRestorationProcessingUnitSize * - kRestorationProcessingUnitSize, // == 4096 - // The max size of a box filter process intermediate buffer. - kBoxFilterProcessIntermediatePixels = - (kRestorationProcessingUnitSizeWithBorders + kRestorationPadding) * - (kRestorationProcessingUnitSizeWithBorders + - kRestorationPadding), // == 5476 + kRestorationUnitHeight = 64, + kRestorationUnitWidth = 256, + kRestorationUnitHeightWithBorders = + kRestorationUnitHeight + 2 * kRestorationBorder, + kRestorationUnitWidthWithBorders = + kRestorationUnitWidth + 2 * kRestorationBorder, kSuperResFilterBits = 6, kSuperResFilterShifts = 1 << kSuperResFilterBits, kSuperResFilterTaps = 8, @@ -148,8 +145,6 @@ enum { kMaxFrameDistance = 31, kReferenceFrameScalePrecision = 14, kNumWienerCoefficients = 3, - // Maximum number of threads that the library will ever use at any given time. - kMaxThreads = 32, kLoopFilterMaxModeDeltas = 2, kMaxCdefStrengths = 8, kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available. @@ -512,14 +507,6 @@ enum ObuType : int8_t { kObuPadding = 15, }; -// Enum to track the processing state of a superblock. 
-enum SuperBlockState : uint8_t { - kSuperBlockStateNone, // Not yet parsed or decoded. - kSuperBlockStateParsed, // Parsed but not yet decoded. - kSuperBlockStateScheduled, // Scheduled for decoding. - kSuperBlockStateDecoded // Parsed and decoded. -}; - //------------------------------------------------------------------------------ // ToString() // diff --git a/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake b/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake index 50bf941306f..8b6ec4bee32 100644 --- a/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake +++ b/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake @@ -44,6 +44,7 @@ list(APPEND libgav1_utils_sources "${libgav1_source}/utils/queue.h" "${libgav1_source}/utils/raw_bit_reader.cc" "${libgav1_source}/utils/raw_bit_reader.h" + "${libgav1_source}/utils/reference_info.h" "${libgav1_source}/utils/segmentation.cc" "${libgav1_source}/utils/segmentation.h" "${libgav1_source}/utils/segmentation_map.cc" diff --git a/chromium/third_party/libgav1/src/src/utils/reference_info.h b/chromium/third_party/libgav1/src/src/utils/reference_info.h new file mode 100644 index 00000000000..a6607912ab8 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/utils/reference_info.h @@ -0,0 +1,92 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ +#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ + +#include <array> +#include <cstdint> + +#include "src/utils/array_2d.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// This struct collects some members related to reference frames in one place to +// make it easier to pass them as parameters to some dsp functions. +struct ReferenceInfo { + // Initialize |motion_field_reference_frame| so that + // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when + // the updates are the same as the initialized value. + // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify + // branch conditions in motion field projection. + // The following memory initialization of contiguous memory is very fast. It + // is not recommended to make the initialization multi-threaded, unless the + // memory which needs to be initialized in each thread is still contiguous. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) { + return motion_field_reference_frame.Reset(rows, columns, + /*zero_initialize=*/true) && + motion_field_mv.Reset( + rows, columns, +#if LIBGAV1_MSAN + // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only + // for qualified blocks. In MotionFieldProjectionKernel() dsp + // optimizations, it is read no matter it was set or not. + /*zero_initialize=*/true +#else + /*zero_initialize=*/false +#endif + ); + } + + // All members are used by inter frames only. + // For intra frames, they are not initialized. 
+ + std::array<uint8_t, kNumReferenceFrameTypes> order_hint; + + // An example when |relative_distance_from| does not equal + // -|relative_distance_to|: + // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64 + // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64 + // This is why we need both |relative_distance_from| and + // |relative_distance_to|. + // |relative_distance_from|: Relative distances from reference frames to this + // frame. + std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from; + // |relative_distance_to|: Relative distances to reference frames. + std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to; + + // Skip motion field projection of specific types of frames if their + // |relative_distance_to| is negative or too large. + std::array<bool, kNumReferenceFrameTypes> skip_references; + // Lookup table to get motion field projection division multiplier of specific + // types of frames. Derived from kProjectionMvDivisionLookup. + std::array<int16_t, kNumReferenceFrameTypes> projection_divisions; + + // The current frame's |motion_field_reference_frame| and |motion_field_mv_| + // are guaranteed to be allocated only when refresh_frame_flags is not 0. + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec. + Array2D<ReferenceFrameType> motion_field_reference_frame; + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. + Array2D<MotionVector> motion_field_mv; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ diff --git a/chromium/third_party/libgav1/src/src/utils/types.h b/chromium/third_party/libgav1/src/src/utils/types.h index 8a95bdb20f9..89b35ad7b21 100644 --- a/chromium/third_party/libgav1/src/src/utils/types.h +++ b/chromium/third_party/libgav1/src/src/utils/types.h @@ -283,8 +283,10 @@ struct Delta { }; struct Cdef { - uint8_t damping; + uint8_t damping; // damping value from the spec + (bitdepth - 8). uint8_t bits; + // All the strength values are the values from the spec and left shifted by + // (bitdepth - 8). uint8_t y_primary_strength[kMaxCdefStrengths]; uint8_t y_secondary_strength[kMaxCdefStrengths]; uint8_t uv_primary_strength[kMaxCdefStrengths]; |