Diffstat (limited to 'chromium/third_party/libgav1')
83 files changed, 8272 insertions, 4951 deletions
diff --git a/chromium/third_party/libgav1/BUILD.gn b/chromium/third_party/libgav1/BUILD.gn
index 9a31f423f6e..3a28871b8d2 100644
--- a/chromium/third_party/libgav1/BUILD.gn
+++ b/chromium/third_party/libgav1/BUILD.gn
@@ -16,6 +16,10 @@ config("public_libgav1_config") {
     "LIBGAV1_THREADPOOL_USE_STD_MUTEX",  # to avoid abseil dependency.
     "LIBGAV1_ENABLE_LOGGING=0",  # to avoid debug log of libgav1 in chromium
                                  # debug build.
+
+    # Don't let libgav1 export any symbols. Otherwise the verify_order step on
+    # macOS can fail since these exports end up in the final Chromium binary.
+    "LIBGAV1_PUBLIC=",
   ]
 }
diff --git a/chromium/third_party/libgav1/README.chromium b/chromium/third_party/libgav1/README.chromium
index fc62bb71907..27a8fe8222f 100644
--- a/chromium/third_party/libgav1/README.chromium
+++ b/chromium/third_party/libgav1/README.chromium
@@ -2,9 +2,9 @@ Name: libgav1
 Short Name: libgav1
 URL: https://chromium.googlesource.com/codecs/libgav1/
 Version: 0
-Date: Thursday March 26 2020
+Date: Saturday May 23 2020
 Branch: master
-Commit: 638ef84819f8b3cd614dcf63378fe4814aa4cb2a
+Commit: bf190c43e5c7cc81751867c917a81bc2920be079
 License: Apache 2.0
 License File: libgav1/LICENSE
 Security Critical: yes
diff --git a/chromium/third_party/libgav1/libgav1_srcs.gni b/chromium/third_party/libgav1/libgav1_srcs.gni
index e460d030f1b..9dc54f97124 100644
--- a/chromium/third_party/libgav1/libgav1_srcs.gni
+++ b/chromium/third_party/libgav1/libgav1_srcs.gni
@@ -15,8 +15,6 @@ gav1_common_sources = [
   "//third_party/libgav1/src/src/frame_scratch_buffer.h",
   "//third_party/libgav1/src/src/internal_frame_buffer_list.cc",
   "//third_party/libgav1/src/src/internal_frame_buffer_list.h",
-  "//third_party/libgav1/src/src/loop_filter_mask.cc",
-  "//third_party/libgav1/src/src/loop_filter_mask.h",
   "//third_party/libgav1/src/src/loop_restoration_info.cc",
   "//third_party/libgav1/src/src/loop_restoration_info.h",
   "//third_party/libgav1/src/src/motion_vector.cc",
@@ -146,6 +144,10 @@ gav1_dsp_sources = [
   "//third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.h",
   "//third_party/libgav1/src/src/dsp/x86/mask_blend_sse4.cc",
   "//third_party/libgav1/src/src/dsp/x86/mask_blend_sse4.h",
+  "//third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc",
+  "//third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h",
+  "//third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc",
+  "//third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h",
   "//third_party/libgav1/src/src/dsp/x86/obmc_sse4.cc",
   "//third_party/libgav1/src/src/dsp/x86/obmc_sse4.h",
   "//third_party/libgav1/src/src/dsp/x86/super_res_sse4.cc",
   "//third_party/libgav1/src/src/dsp/x86/super_res_sse4.h",
@@ -215,6 +217,7 @@ gav1_utils_sources = [
   "//third_party/libgav1/src/src/utils/queue.h",
   "//third_party/libgav1/src/src/utils/raw_bit_reader.cc",
   "//third_party/libgav1/src/src/utils/raw_bit_reader.h",
+  "//third_party/libgav1/src/src/utils/reference_info.h",
   "//third_party/libgav1/src/src/utils/segmentation.cc",
   "//third_party/libgav1/src/src/utils/segmentation.h",
   "//third_party/libgav1/src/src/utils/segmentation_map.cc",
diff --git a/chromium/third_party/libgav1/src/README.md b/chromium/third_party/libgav1/src/README.md
index ead3fc3b8ee..a5799d1395b 100644
--- a/chromium/third_party/libgav1/src/README.md
+++ b/chromium/third_party/libgav1/src/README.md
@@ -56,6 +56,9 @@ Configuration options:
   absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
   dependency from the core library. Automatically defined in
   `src/utils/threadpool.h` if unset.
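The `LIBGAV1_PUBLIC=` define added to BUILD.gn above works because the library's exported entry points are declared through that macro; defining it to nothing removes the visibility attribute so the symbols stay internal to the Chromium binary. A minimal sketch of how such an export macro is commonly structured (the names below are illustrative, not libgav1's actual header):

```cpp
// visibility_example.h -- illustrative only; not libgav1's actual header.
#ifndef EXAMPLE_VISIBILITY_H_
#define EXAMPLE_VISIBILITY_H_

// If the build defines EXAMPLE_PUBLIC (possibly to nothing, as the GN change
// above does with "LIBGAV1_PUBLIC="), that definition wins and the functions
// below are not exported from the final binary.
#ifndef EXAMPLE_PUBLIC
#if defined(_WIN32)
#define EXAMPLE_PUBLIC __declspec(dllexport)
#else
#define EXAMPLE_PUBLIC __attribute__((visibility("default")))
#endif
#endif

// Exported unless EXAMPLE_PUBLIC is defined to be empty.
EXAMPLE_PUBLIC int ExampleDecodeFrame();

#endif  // EXAMPLE_VISIBILITY_H_
```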
+* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is + allowed to create. Has to be an integer > 0. Otherwise this is ignored. + The default value is 128. For additional options see: diff --git a/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake b/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake index cd5ff9e1230..31017a9de14 100644 --- a/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake +++ b/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake @@ -63,6 +63,14 @@ macro(libgav1_set_build_definitions) list(APPEND libgav1_clang_cxx_flags "-Wmissing-prototypes" "-Wshorten-64-to-32") + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6") + # Quiet warnings in copy-list-initialization where {} elision has always + # been allowed. + list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces") + endif() + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7") # Quiet warnings due to potential snprintf() truncation in threadpool.cc. diff --git a/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake b/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake index 295b078756a..0b8df60f3df 100644 --- a/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake +++ b/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake @@ -212,14 +212,17 @@ endmacro() macro(libgav1_set_cxx_flags) unset(cxx_flag_lists) - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND cxx_flag_lists libgav1_clang_cxx_flags) - endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") list(APPEND cxx_flag_lists libgav1_base_cxx_flags) endif() + # Append clang flags after the base set to allow -Wno* overrides to take + # effect. Some of the base flags may enable a large set of warnings, e.g., + # -Wall. + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND cxx_flag_lists libgav1_clang_cxx_flags) + endif() + if(MSVC) list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags) endif() diff --git a/chromium/third_party/libgav1/src/examples/file_reader.cc b/chromium/third_party/libgav1/src/examples/file_reader.cc index f174e2d67b6..b0967227ef8 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader.cc +++ b/chromium/third_party/libgav1/src/examples/file_reader.cc @@ -26,7 +26,6 @@ #include <io.h> #endif -#include "absl/memory/memory.h" #include "examples/file_reader_constants.h" #include "examples/file_reader_factory.h" #include "examples/file_reader_interface.h" @@ -53,10 +52,9 @@ FileReader::~FileReader() { } std::unique_ptr<FileReaderInterface> FileReader::Open( - absl::string_view file_name, const bool error_tolerant) { + const std::string& file_name, const bool error_tolerant) { if (file_name.empty()) return nullptr; - const std::string fopen_file_name = std::string(file_name); FILE* raw_file_ptr; bool owns_file = true; @@ -64,14 +62,14 @@ std::unique_ptr<FileReaderInterface> FileReader::Open( raw_file_ptr = SetBinaryMode(stdin); owns_file = false; // stdin is owned by the Standard C Library. 
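The examples/ changes in this patch drop the Abseil helpers: `absl::string_view` parameters become `const std::string&`, `ABSL_MUST_USE_RESULT` is commented out, and `absl::WrapUnique(new (std::nothrow) ...)` becomes a plain `std::unique_ptr` holding a nothrow allocation, as in `FileReader::Open()` below. A small sketch of the smart-pointer part of that pattern, using a hypothetical `Widget` type:

```cpp
#include <memory>
#include <new>

struct Widget {
  explicit Widget(int id) : id(id) {}
  int id;
};

std::unique_ptr<Widget> MakeWidget(int id) {
  // Before: auto w = absl::WrapUnique(new (std::nothrow) Widget(id));
  // After: the standard library is sufficient; ownership semantics are
  // identical and allocation failure still yields a null pointer.
  std::unique_ptr<Widget> w(new (std::nothrow) Widget(id));
  if (w == nullptr) {
    return nullptr;  // out of memory
  }
  return w;
}
```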
} else { - raw_file_ptr = fopen(fopen_file_name.c_str(), "rb"); + raw_file_ptr = fopen(file_name.c_str(), "rb"); } if (raw_file_ptr == nullptr) { return nullptr; } - auto file = absl::WrapUnique( + std::unique_ptr<FileReader> file( new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant)); if (file == nullptr) { LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory"); diff --git a/chromium/third_party/libgav1/src/examples/file_reader.h b/chromium/third_party/libgav1/src/examples/file_reader.h index ad5911e32fe..c342a20df1e 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader.h +++ b/chromium/third_party/libgav1/src/examples/file_reader.h @@ -21,10 +21,9 @@ #include <cstdint> #include <cstdio> #include <memory> +#include <string> #include <vector> -#include "absl/base/attributes.h" -#include "absl/strings/string_view.h" #include "examples/file_reader_interface.h" namespace libgav1 { @@ -42,7 +41,7 @@ class FileReader : public FileReaderInterface { // ReadTemporalUnit() may return truncated data. // Returns nullptr when the file does not exist, cannot be read, or is not an // IVF file. - static std::unique_ptr<FileReaderInterface> Open(absl::string_view file_name, + static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name, bool error_tolerant = false); FileReader() = delete; @@ -62,10 +61,10 @@ class FileReader : public FileReaderInterface { // The |timestamp| pointer is optional: callers not interested in timestamps // can pass nullptr. When |timestamp| is not a nullptr, this function returns // the presentation timestamp from the IVF frame header. - ABSL_MUST_USE_RESULT bool ReadTemporalUnit(std::vector<uint8_t>* tu_data, - int64_t* timestamp) override; + /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit( + std::vector<uint8_t>* tu_data, int64_t* timestamp) override; - ABSL_MUST_USE_RESULT bool IsEndOfFile() const override { + /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override { return feof(file_) != 0; } diff --git a/chromium/third_party/libgav1/src/examples/file_reader_factory.cc b/chromium/third_party/libgav1/src/examples/file_reader_factory.cc index 860d916423d..d5260eba893 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader_factory.cc +++ b/chromium/third_party/libgav1/src/examples/file_reader_factory.cc @@ -38,7 +38,7 @@ bool FileReaderFactory::RegisterReader(OpenFunction open_function) { } std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader( - absl::string_view file_name, const bool error_tolerant /*= false*/) { + const std::string& file_name, const bool error_tolerant /*= false*/) { for (auto* open_function : *GetFileReaderOpenFunctions()) { auto reader = open_function(file_name, error_tolerant); if (reader == nullptr) continue; diff --git a/chromium/third_party/libgav1/src/examples/file_reader_factory.h b/chromium/third_party/libgav1/src/examples/file_reader_factory.h index ddf8744d19b..0f534845e75 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader_factory.h +++ b/chromium/third_party/libgav1/src/examples/file_reader_factory.h @@ -18,8 +18,8 @@ #define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_ #include <memory> +#include <string> -#include "absl/strings/string_view.h" #include "examples/file_reader_interface.h" namespace libgav1 { @@ -27,7 +27,7 @@ namespace libgav1 { class FileReaderFactory { public: using OpenFunction = std::unique_ptr<FileReaderInterface> (*)( - absl::string_view file_name, bool error_tolerant); + const std::string& file_name, bool error_tolerant); FileReaderFactory() = 
delete; FileReaderFactory(const FileReaderFactory&) = delete; @@ -43,7 +43,7 @@ class FileReaderFactory { // returned. If |error_tolerant| is true and the reader supports it, some // format and read errors may be ignored and partial data returned. static std::unique_ptr<FileReaderInterface> OpenReader( - absl::string_view file_name, bool error_tolerant = false); + const std::string& file_name, bool error_tolerant = false); }; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/examples/file_reader_interface.h b/chromium/third_party/libgav1/src/examples/file_reader_interface.h index d768017e2ba..d8f703091e2 100644 --- a/chromium/third_party/libgav1/src/examples/file_reader_interface.h +++ b/chromium/third_party/libgav1/src/examples/file_reader_interface.h @@ -21,8 +21,6 @@ #include <cstdint> #include <vector> -#include "absl/base/attributes.h" - namespace libgav1 { class FileReaderInterface { @@ -47,10 +45,10 @@ class FileReaderInterface { // The |timestamp| pointer is optional: callers not interested in timestamps // can pass nullptr. When |timestamp| is not a nullptr, this function returns // the presentation timestamp of the temporal unit. - ABSL_MUST_USE_RESULT virtual bool ReadTemporalUnit( + /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit( std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0; - ABSL_MUST_USE_RESULT virtual bool IsEndOfFile() const = 0; + /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0; // The values returned by these accessors are strictly informative. No // validation is performed when they are read from file. diff --git a/chromium/third_party/libgav1/src/examples/file_writer.cc b/chromium/third_party/libgav1/src/examples/file_writer.cc index bf13d4a1199..54afe145cde 100644 --- a/chromium/third_party/libgav1/src/examples/file_writer.cc +++ b/chromium/third_party/libgav1/src/examples/file_writer.cc @@ -25,8 +25,6 @@ #include <io.h> #endif -#include "absl/memory/memory.h" -#include "absl/strings/str_format.h" #include "examples/logging.h" namespace libgav1 { @@ -72,9 +70,8 @@ std::string GetY4mColorSpaceString( if (y4m_parameters.bitdepth > 8) { const bool monochrome = y4m_parameters.image_format == kImageFormatMonochrome400; - color_space_string = - absl::StrFormat("%s%s%d", color_space_string, monochrome ? 
"" : "p", - y4m_parameters.bitdepth); + if (!monochrome) color_space_string += "p"; + color_space_string += std::to_string(y4m_parameters.bitdepth); } return color_space_string; @@ -85,7 +82,7 @@ std::string GetY4mColorSpaceString( FileWriter::~FileWriter() { fclose(file_); } std::unique_ptr<FileWriter> FileWriter::Open( - absl::string_view file_name, FileType file_type, + const std::string& file_name, FileType file_type, const Y4mParameters* const y4m_parameters) { if (file_name.empty() || (file_type == kFileTypeY4m && y4m_parameters == nullptr) || @@ -94,13 +91,12 @@ std::unique_ptr<FileWriter> FileWriter::Open( return nullptr; } - const std::string fopen_file_name = std::string(file_name); FILE* raw_file_ptr; if (file_name == "-") { raw_file_ptr = SetBinaryMode(stdout); } else { - raw_file_ptr = fopen(fopen_file_name.c_str(), "wb"); + raw_file_ptr = fopen(file_name.c_str(), "wb"); } if (raw_file_ptr == nullptr) { @@ -108,7 +104,7 @@ std::unique_ptr<FileWriter> FileWriter::Open( return nullptr; } - auto file = absl::WrapUnique(new (std::nothrow) FileWriter(raw_file_ptr)); + std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr)); if (file == nullptr) { LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory"); fclose(raw_file_ptr); @@ -173,11 +169,13 @@ bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) { // // More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2 bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) { - std::string y4m_header = absl::StrFormat( - "YUV4MPEG2 W%zu H%zu F%zu:%zu Ip C%s\n", y4m_parameters.width, - y4m_parameters.height, y4m_parameters.frame_rate_numerator, - y4m_parameters.frame_rate_denominator, - GetY4mColorSpaceString(y4m_parameters)); + std::string y4m_header = "YUV4MPEG2"; + y4m_header += " W" + std::to_string(y4m_parameters.width); + y4m_header += " H" + std::to_string(y4m_parameters.height); + y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) + + ":" + std::to_string(y4m_parameters.frame_rate_denominator); + y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters); + y4m_header += "\n"; return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) == y4m_header.length(); } diff --git a/chromium/third_party/libgav1/src/examples/file_writer.h b/chromium/third_party/libgav1/src/examples/file_writer.h index a7b1937dd37..00f6cc38097 100644 --- a/chromium/third_party/libgav1/src/examples/file_writer.h +++ b/chromium/third_party/libgav1/src/examples/file_writer.h @@ -21,9 +21,8 @@ #include <cstdint> #include <cstdio> #include <memory> +#include <string> -#include "absl/base/attributes.h" -#include "absl/strings/string_view.h" #include "gav1/decoder_buffer.h" namespace libgav1 { @@ -70,7 +69,7 @@ class FileWriter { // Returns a FileWriter instance after the file is opened successfully for // kFileTypeRaw files, and after the Y4M file header bytes are written for // kFileTypeY4m files. Returns nullptr upon failure. - static std::unique_ptr<FileWriter> Open(absl::string_view file_name, + static std::unique_ptr<FileWriter> Open(const std::string& file_name, FileType type, const Y4mParameters* y4m_parameters); @@ -86,7 +85,8 @@ class FileWriter { // Writes the frame data in |frame_buffer| to |file_|. Returns true after // successful write of |frame_buffer| data. 
- ABSL_MUST_USE_RESULT bool WriteFrame(const DecoderBuffer& frame_buffer); + /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame( + const DecoderBuffer& frame_buffer); private: explicit FileWriter(FILE* file) : file_(file) {} diff --git a/chromium/third_party/libgav1/src/examples/logging.h b/chromium/third_party/libgav1/src/examples/logging.h index ba784ef5c15..536ed1dbaf2 100644 --- a/chromium/third_party/libgav1/src/examples/logging.h +++ b/chromium/third_party/libgav1/src/examples/logging.h @@ -18,6 +18,7 @@ #define LIBGAV1_EXAMPLES_LOGGING_H_ #include <cstddef> +#include <cstdio> namespace libgav1 { namespace examples { diff --git a/chromium/third_party/libgav1/src/src/buffer_pool.cc b/chromium/third_party/libgav1/src/src/buffer_pool.cc index 282da8c948b..c1a5606cd11 100644 --- a/chromium/third_party/libgav1/src/src/buffer_pool.cc +++ b/chromium/third_party/libgav1/src/src/buffer_pool.cc @@ -69,27 +69,13 @@ bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) { render_height_ = frame_header.render_height; rows4x4_ = frame_header.rows4x4; columns4x4_ = frame_header.columns4x4; - const int rows4x4_half = DivideBy2(rows4x4_); - const int columns4x4_half = DivideBy2(columns4x4_); - if (!motion_field_reference_frame_.Reset(rows4x4_half, columns4x4_half, - /*zero_initialize=*/false) || - !motion_field_mv_.Reset(rows4x4_half, columns4x4_half, - /*zero_initialize=*/false)) { - return false; - } - if (frame_header.refresh_frame_flags != 0) { - // Initialize so that Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip - // some updates when the updates are the same as the initialized value. - // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify - // branch conditions in motion field projection. - // The following memory initialization of contiguous memory is very fast. It - // is not recommended to make the initialization multi-threaded, unless the - // memory which needs to be initialized in each thread is still contiguous. 
- static_assert(sizeof(motion_field_reference_frame_[0][0]) == sizeof(int8_t), - ""); - memset(motion_field_reference_frame_.data(), kReferenceFrameIntra, - sizeof(motion_field_reference_frame_[0][0]) * - motion_field_reference_frame_.size()); + if (frame_header.refresh_frame_flags != 0 && + !IsIntraFrame(frame_header.frame_type)) { + const int rows4x4_half = DivideBy2(rows4x4_); + const int columns4x4_half = DivideBy2(columns4x4_); + if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) { + return false; + } } return segmentation_map_.Allocate(rows4x4_, columns4x4_); } diff --git a/chromium/third_party/libgav1/src/src/buffer_pool.h b/chromium/third_party/libgav1/src/src/buffer_pool.h index 07adc838f12..13008c10cd2 100644 --- a/chromium/third_party/libgav1/src/src/buffer_pool.h +++ b/chromium/third_party/libgav1/src/src/buffer_pool.h @@ -19,6 +19,7 @@ #include <array> #include <cassert> +#include <climits> #include <condition_variable> // NOLINT (unapproved c++11 header) #include <cstdint> #include <cstring> @@ -29,9 +30,9 @@ #include "src/gav1/frame_buffer.h" #include "src/internal_frame_buffer_list.h" #include "src/symbol_decoder_context.h" -#include "src/utils/array_2d.h" #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" +#include "src/utils/reference_info.h" #include "src/utils/segmentation.h" #include "src/utils/segmentation_map.h" #include "src/utils/types.h" @@ -108,21 +109,11 @@ class RefCountedBuffer { bool showable_frame() const { return showable_frame_; } void set_showable_frame(bool value) { showable_frame_ = value; } - // This array has kNumReferenceFrameTypes elements. - const uint8_t* order_hint_array() const { return order_hint_.data(); } - uint8_t order_hint(ReferenceFrameType reference_frame) const { - return order_hint_[reference_frame]; - } - void set_order_hint(ReferenceFrameType reference_frame, uint8_t order_hint) { - order_hint_[reference_frame] = order_hint; - } - void ClearOrderHints() { order_hint_.fill(0); } - // Sets upscaled_width_, frame_width_, frame_height_, render_width_, // render_height_, rows4x4_ and columns4x4_ from the corresponding fields - // in frame_header. Allocates motion_field_reference_frame_, - // motion_field_mv_, and segmentation_map_. Returns true on success, false - // on failure. + // in frame_header. Allocates reference_info_.motion_field_reference_frame, + // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on + // success, false on failure. bool SetFrameDimensions(const ObuFrameHeader& frame_header); int32_t upscaled_width() const { return upscaled_width_; } @@ -135,27 +126,6 @@ class RefCountedBuffer { int32_t rows4x4() const { return rows4x4_; } int32_t columns4x4() const { return columns4x4_; } - // Entry at |row|, |column| corresponds to - // MfRefFrames[row * 2 + 1][column * 2 + 1] in the spec. - ReferenceFrameType* motion_field_reference_frame(int row, int column) { - return &motion_field_reference_frame_[row][column]; - } - - const ReferenceFrameType* motion_field_reference_frame(int row, - int column) const { - return &motion_field_reference_frame_[row][column]; - } - - // Entry at |row|, |column| corresponds to - // MfMvs[row * 2 + 1][column * 2 + 1] in the spec. 
- MotionVector* motion_field_mv(int row, int column) { - return &motion_field_mv_[row][column]; - } - - const MotionVector* motion_field_mv(int row, int column) const { - return &motion_field_mv_[row][column]; - } - SegmentationMap* segmentation_map() { return &segmentation_map_; } const SegmentationMap* segmentation_map() const { return &segmentation_map_; } @@ -205,6 +175,9 @@ class RefCountedBuffer { film_grain_params_ = params; } + const ReferenceInfo* reference_info() const { return &reference_info_; } + ReferenceInfo* reference_info() { return &reference_info_; } + // This will wake up the WaitUntil*() functions and make them return false. void Abort() { { @@ -217,8 +190,10 @@ class RefCountedBuffer { } void SetFrameState(FrameState frame_state) { - std::lock_guard<std::mutex> lock(mutex_); - frame_state_ = frame_state; + { + std::lock_guard<std::mutex> lock(mutex_); + frame_state_ = frame_state; + } if (frame_state == kFrameStateParsed) { parsed_condvar_.notify_all(); } else if (frame_state == kFrameStateDecoded) { @@ -230,9 +205,11 @@ class RefCountedBuffer { // Sets the progress of this frame to |progress_row| and notifies any threads // that may be waiting on rows <= |progress_row|. void SetProgress(int progress_row) { - std::lock_guard<std::mutex> lock(mutex_); - if (progress_row_ >= progress_row) return; - progress_row_ = progress_row; + { + std::lock_guard<std::mutex> lock(mutex_); + if (progress_row_ >= progress_row) return; + progress_row_ = progress_row; + } progress_row_condvar_.notify_all(); } @@ -257,8 +234,14 @@ class RefCountedBuffer { } // Waits until the |progress_row| has been decoded (as indicated either by - // |progress_row_| or |frame_state_|). - bool WaitUntil(int progress_row) { + // |progress_row_| or |frame_state_|). |progress_row_cache| must not be + // nullptr and will be populated with the value of |progress_row_| after the + // wait. + // + // Typical usage of |progress_row_cache| is as follows: + // * Initialize |*progress_row_cache| to INT_MIN. + // * Call WaitUntil only if |*progress_row_cache| < |progress_row|. + bool WaitUntil(int progress_row, int* progress_row_cache) { // If |progress_row| is negative, it means that the wait is on the top // border to be available. The top border will be available when row 0 has // been decoded. So we can simply wait on row 0 instead. @@ -268,6 +251,11 @@ class RefCountedBuffer { !abort_) { progress_row_condvar_.wait(lock); } + // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no + // longer be updated. So we set |*progress_row_cache| to INT_MAX in that + // case. + *progress_row_cache = + (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX; return !abort_; } @@ -311,8 +299,6 @@ class RefCountedBuffer { ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown; bool showable_frame_ = false; - std::array<uint8_t, kNumReferenceFrameTypes> order_hint_ = {}; - int32_t upscaled_width_ = 0; int32_t frame_width_ = 0; int32_t frame_height_ = 0; @@ -321,12 +307,6 @@ class RefCountedBuffer { int32_t columns4x4_ = 0; int32_t rows4x4_ = 0; - // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds - // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec. - Array2D<ReferenceFrameType> motion_field_reference_frame_; - // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds - // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. - Array2D<MotionVector> motion_field_mv_; // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array. 
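The new `WaitUntil(int progress_row, int* progress_row_cache)` contract above lets callers skip the lock entirely when a previously observed progress value already covers the row they need. A hedged sketch of a caller following the usage described in the comment (the `Buffer` type and the helper are placeholders, not part of the patch):

```cpp
#include <climits>

// Returns false if decoding of |buffer| was aborted. |*progress_row_cache|
// starts out as INT_MIN and is reused across calls, so the mutex inside
// WaitUntil() is only taken when the needed row has not yet been observed.
template <typename Buffer>
bool WaitForRow(Buffer* buffer, int needed_row, int* progress_row_cache) {
  if (*progress_row_cache >= needed_row) return true;  // already decoded
  return buffer->WaitUntil(needed_row, progress_row_cache);
}

// Typical use inside a prediction loop:
//   int progress_row_cache = INT_MIN;
//   for (...) {
//     if (!WaitForRow(reference_frame, reference_row, &progress_row_cache))
//       return false;
//     ...
//   }
```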
SegmentationMap segmentation_map_; @@ -344,6 +324,7 @@ class RefCountedBuffer { // on feature_enabled only, we also save their values as an optimization. Segmentation segmentation_ = {}; FilmGrainParams film_grain_params_ = {}; + ReferenceInfo reference_info_; }; // RefCountedBufferPtr contains a reference to a RefCountedBuffer. diff --git a/chromium/third_party/libgav1/src/src/decoder_impl.cc b/chromium/third_party/libgav1/src/src/decoder_impl.cc index 841e4efed4b..508bbde4822 100644 --- a/chromium/third_party/libgav1/src/src/decoder_impl.cc +++ b/chromium/third_party/libgav1/src/src/decoder_impl.cc @@ -27,7 +27,6 @@ #include "src/film_grain.h" #include "src/frame_buffer_utils.h" #include "src/frame_scratch_buffer.h" -#include "src/loop_filter_mask.h" #include "src/loop_restoration_info.h" #include "src/obu_parser.h" #include "src/post_filter.h" @@ -36,6 +35,7 @@ #include "src/threading_strategy.h" #include "src/utils/blocking_counter.h" #include "src/utils/common.h" +#include "src/utils/constants.h" #include "src/utils/logging.h" #include "src/utils/parameter_tree.h" #include "src/utils/raw_bit_reader.h" @@ -61,6 +61,41 @@ int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration, return border; } +// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on +// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first +// |count| condition variables in +// |frame_scratch_buffer->superblock_row_progress_condvar|. +void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer, + int count) { + { + std::lock_guard<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + frame_scratch_buffer->tile_decoding_failed = true; + } + std::condition_variable* const condvars = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + for (int i = 0; i < count; ++i) { + condvars[i].notify_one(); + } +} + +// Helper class that releases the frame scratch buffer in the destructor. +class FrameScratchBufferReleaser { + public: + FrameScratchBufferReleaser( + FrameScratchBufferPool* frame_scratch_buffer_pool, + std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer) + : frame_scratch_buffer_pool_(frame_scratch_buffer_pool), + frame_scratch_buffer_(frame_scratch_buffer) {} + ~FrameScratchBufferReleaser() { + frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_)); + } + + private: + FrameScratchBufferPool* const frame_scratch_buffer_pool_; + std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_; +}; + } // namespace // static @@ -107,22 +142,40 @@ DecoderImpl::~DecoderImpl() { } StatusCode DecoderImpl::Init() { + if (!GenerateWedgeMask(&wedge_masks_)) { + LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed."); + return kStatusOutOfMemory; + } + return kStatusOk; +} + +StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue( + const uint8_t* data, size_t size) { if (settings_.frame_parallel) { -#if defined(ENABLE_FRAME_PARALLEL) - if (settings_.threads > 1) { - if (!InitializeThreadPoolsForFrameParallel(settings_.threads, - &frame_thread_pool_)) { - return kStatusOutOfMemory; - } - // TODO(b/142583029): Frame parallel decoding with in-frame - // multi-threading is not yet implemented. Until then, we force - // settings_.threads to 1 when frame parallel decoding is enabled. 
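`SetFrameState()` and `SetProgress()` above were rescoped so the mutex is released before the condition variable is notified; waiters that wake up can then acquire the lock immediately instead of blocking on it again. A generic sketch of that scoped-lock-then-notify pattern:

```cpp
#include <condition_variable>
#include <mutex>

class Progress {
 public:
  void Set(int row) {
    {
      // Update the shared state while holding the lock...
      std::lock_guard<std::mutex> lock(mutex_);
      if (row <= row_) return;
      row_ = row;
    }
    // ...but notify after the lock has been released.
    condvar_.notify_all();
  }

  void WaitFor(int row) {
    std::unique_lock<std::mutex> lock(mutex_);
    condvar_.wait(lock, [this, row] { return row_ >= row; });
  }

 private:
  std::mutex mutex_;
  std::condition_variable condvar_;
  int row_ = -1;
};
```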
- settings_.threads = 1; + DecoderState state; + std::unique_ptr<ObuParser> obu( + new (std::nothrow) ObuParser(data, size, &buffer_pool_, &state)); + if (obu == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser."); + return kStatusOutOfMemory; + } + RefCountedBufferPtr current_frame; + const StatusCode status = obu->ParseOneFrame(¤t_frame); + if (status != kStatusOk) { + LIBGAV1_DLOG(ERROR, "Failed to parse OBU."); + return status; + } + current_frame = nullptr; + // We assume that the first frame that was parsed will contain the frame + // header. This assumption is usually true in practice. So we will simply + // not use frame parallel mode if this is not the case. + if (settings_.threads > 1 && + !InitializeThreadPoolsForFrameParallel( + settings_.threads, obu->frame_header().tile_info.tile_count, + obu->frame_header().tile_info.tile_columns, &frame_thread_pool_, + &frame_scratch_buffer_pool_)) { + return kStatusOutOfMemory; } -#else - LIBGAV1_DLOG( - ERROR, "Frame parallel decoding is not implemented, ignoring setting."); -#endif // defined(ENABLE_FRAME_PARALLEL) } const int max_allowed_frames = GetMaxAllowedFrames(); assert(max_allowed_frames > 0); @@ -130,10 +183,6 @@ StatusCode DecoderImpl::Init() { LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed."); return kStatusOutOfMemory; } - if (!GenerateWedgeMask(&wedge_masks_)) { - LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed."); - return kStatusOutOfMemory; - } return kStatusOk; } @@ -141,7 +190,19 @@ StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size, int64_t user_private_data, void* buffer_private_data) { if (data == nullptr || size == 0) return kStatusInvalidArgument; - if (abort_) return kStatusUnknownError; + if (HasFailure()) return kStatusUnknownError; + if (!seen_first_frame_) { + seen_first_frame_ = true; + const StatusCode status = + InitializeFrameThreadPoolAndTemporalUnitQueue(data, size); + if (status != kStatusOk) { + if (settings_.release_input_buffer != nullptr) { + settings_.release_input_buffer(settings_.callback_private_data, + buffer_private_data); + } + return SignalFailure(status); + } + } if (temporal_units_.Full()) { return kStatusTryAgain; } @@ -153,11 +214,13 @@ StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size, StatusCode DecoderImpl::SignalFailure(StatusCode status) { if (status == kStatusOk || status == kStatusTryAgain) return status; - abort_ = true; - failure_status_ = status; // Make sure all waiting threads exit. 
buffer_pool_.Abort(); frame_thread_pool_ = nullptr; + { + std::lock_guard<std::mutex> lock(mutex_); + failure_status_ = status; + } while (!temporal_units_.Empty()) { if (settings_.release_input_buffer != nullptr) { settings_.release_input_buffer( @@ -197,17 +260,22 @@ StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) { temporal_units_.Pop(); return status; } - if (settings_.blocking_dequeue) { + { std::unique_lock<std::mutex> lock(mutex_); - while (!temporal_unit.decoded && !abort_) { - decoded_condvar_.wait(lock); + if (settings_.blocking_dequeue) { + while (!temporal_unit.decoded && failure_status_ == kStatusOk) { + decoded_condvar_.wait(lock); + } + } else { + if (!temporal_unit.decoded && failure_status_ == kStatusOk) { + return kStatusTryAgain; + } + } + if (failure_status_ != kStatusOk) { + const StatusCode failure_status = failure_status_; + lock.unlock(); + return SignalFailure(failure_status); } - } else { - std::lock_guard<std::mutex> lock(mutex_); - if (!temporal_unit.decoded && !abort_) return kStatusTryAgain; - } - if (abort_) { - return SignalFailure(failure_status_); } if (settings_.release_input_buffer != nullptr) { settings_.release_input_buffer(settings_.callback_private_data, @@ -290,33 +358,32 @@ StatusCode DecoderImpl::ParseAndSchedule() { std::lock_guard<std::mutex> lock(mutex_); temporal_unit.has_displayable_frame = false; temporal_unit.decoded = true; - decoded_condvar_.notify_one(); return kStatusOk; } for (auto& frame : temporal_unit.frames) { EncodedFrame* const encoded_frame = &frame; frame_thread_pool_->Schedule([this, encoded_frame]() { - if (abort_) return; + if (HasFailure()) return; const StatusCode status = DecodeFrame(encoded_frame); - if (abort_) return; encoded_frame->state = {}; encoded_frame->frame = nullptr; TemporalUnit& temporal_unit = encoded_frame->temporal_unit; std::lock_guard<std::mutex> lock(mutex_); + if (failure_status_ != kStatusOk) return; // temporal_unit's status defaults to kStatusOk. So we need to set it only - // on error. If |abort_| is true at this point, it means that there has - // already been a failure. So we don't care about this subsequent failure. - // We will simply return the error code of the first failure. + // on error. If |failure_status_| is not kStatusOk at this point, it means + // that there has already been a failure. So we don't care about this + // subsequent failure. We will simply return the error code of the first + // failure. if (status != kStatusOk) { temporal_unit.status = status; - if (!abort_) { - abort_ = true; + if (failure_status_ == kStatusOk) { failure_status_ = status; } } temporal_unit.decoded = ++temporal_unit.decoded_count == temporal_unit.frames.size(); - if (temporal_unit.decoded || abort_) { + if (temporal_unit.decoded || failure_status_ != kStatusOk) { decoded_condvar_.notify_one(); } }); @@ -330,6 +397,17 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) { const Vector<ObuTileGroup>& tile_groups = encoded_frame->tile_groups; RefCountedBufferPtr current_frame = std::move(encoded_frame->frame); + std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = + frame_scratch_buffer_pool_.Get(); + if (frame_scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); + return kStatusOutOfMemory; + } + // |frame_scratch_buffer| will be released when this local variable goes out + // of scope (i.e.) on any return path in this function. 
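The comment just below refers to `FrameScratchBufferReleaser`, the small RAII helper introduced earlier in this file: it returns the scratch buffer to the pool from its destructor, so every early `return` in `DecodeFrame()` and `DecodeTemporalUnit()` releases the buffer without a manual `Release()` call. A simplified sketch of the same idea with generic pool and buffer types (names are illustrative):

```cpp
#include <memory>
#include <utility>

template <typename Pool, typename Buffer>
class PoolReleaser {
 public:
  PoolReleaser(Pool* pool, std::unique_ptr<Buffer>* buffer)
      : pool_(pool), buffer_(buffer) {}
  // Runs on every return path of the enclosing function.
  ~PoolReleaser() { pool_->Release(std::move(*buffer_)); }

 private:
  Pool* const pool_;
  std::unique_ptr<Buffer>* const buffer_;
};

// Usage sketch:
//   auto buffer = pool.Get();
//   if (buffer == nullptr) return kStatusOutOfMemory;  // nothing to release
//   PoolReleaser<BufferPool, Buffer> releaser(&pool, &buffer);
//   ...  // any early return from here on releases |buffer| back to the pool
```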
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser( + &frame_scratch_buffer_pool_, &frame_scratch_buffer); + StatusCode status; if (!frame_header.show_existing_frame) { if (tile_groups.empty()) { @@ -339,16 +417,9 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) { // not have a reason to handle those cases, so we simply continue. return kStatusOk; } - std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = - frame_scratch_buffer_pool_.Get(); - if (frame_scratch_buffer == nullptr) { - LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); - return kStatusOutOfMemory; - } status = DecodeTiles(sequence_header, frame_header, tile_groups, encoded_frame->state, frame_scratch_buffer.get(), current_frame.get()); - frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer)); if (status != kStatusOk) { return status; } @@ -362,8 +433,9 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) { return kStatusOk; } RefCountedBufferPtr film_grain_frame; - status = ApplyFilmGrain(sequence_header, frame_header, current_frame, - &film_grain_frame, /*thread_pool=*/nullptr); + status = ApplyFilmGrain( + sequence_header, frame_header, current_frame, &film_grain_frame, + frame_scratch_buffer->threading_strategy.thread_pool()); if (status != kStatusOk) { return status; } @@ -402,6 +474,17 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit, RefCountedBufferPtr current_frame; RefCountedBufferPtr displayable_frame; StatusCode status; + std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = + frame_scratch_buffer_pool_.Get(); + if (frame_scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); + return kStatusOutOfMemory; + } + // |frame_scratch_buffer| will be released when this local variable goes out + // of scope (i.e.) on any return path in this function. + FrameScratchBufferReleaser frame_scratch_buffer_releaser( + &frame_scratch_buffer_pool_, &frame_scratch_buffer); + while (obu->HasData()) { status = obu->ParseOneFrame(¤t_frame); if (status != kStatusOk) { @@ -433,16 +516,9 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit, // not have a reason to handle those cases, so we simply continue. 
continue; } - std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = - frame_scratch_buffer_pool_.Get(); - if (frame_scratch_buffer == nullptr) { - LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); - return kStatusOutOfMemory; - } status = DecodeTiles(obu->sequence_header(), obu->frame_header(), obu->tile_groups(), state_, frame_scratch_buffer.get(), current_frame.get()); - frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer)); if (status != kStatusOk) { return status; } @@ -463,17 +539,10 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit, } displayable_frame = std::move(current_frame); RefCountedBufferPtr film_grain_frame; - std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = - frame_scratch_buffer_pool_.Get(); - if (frame_scratch_buffer == nullptr) { - LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); - return kStatusOutOfMemory; - } status = ApplyFilmGrain( obu->sequence_header(), obu->frame_header(), displayable_frame, &film_grain_frame, frame_scratch_buffer->threading_strategy.film_grain_thread_pool()); - frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer)); if (status != kStatusOk) return status; displayable_frame = std::move(film_grain_frame); } @@ -572,25 +641,6 @@ StatusCode DecoderImpl::DecodeTiles( RefCountedBuffer* const current_frame) { frame_scratch_buffer->tile_scratch_buffer_pool.Reset( sequence_header.color_config.bitdepth); - if (IsFrameParallel()) { - // We can parse the current frame if all the reference frames have been - // parsed. - for (int i = 0; i < kNumReferenceFrameTypes; ++i) { - if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) { - continue; - } - if (!state.reference_frame[i]->WaitUntilParsed()) { - return kStatusUnknownError; - } - } - } - if (PostFilter::DoDeblock(frame_header, settings_.post_filter_mask)) { - if (kDeblockFilterBitMask && !frame_scratch_buffer->loop_filter_mask.Reset( - frame_header.width, frame_header.height)) { - LIBGAV1_DLOG(ERROR, "Failed to allocate memory for loop filter masks."); - return kStatusOutOfMemory; - } - } if (!frame_scratch_buffer->loop_restoration_info.Reset( &frame_header.loop_restoration, frame_header.upscaled_width, frame_header.height, sequence_header.color_config.subsampling_x, @@ -671,11 +721,10 @@ StatusCode DecoderImpl::DecodeTiles( // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so // that the block parameters cache can be filled in for the last row/column // without having to check for boundary conditions. - BlockParametersHolder block_parameters_holder( - frame_header.rows4x4 + kMaxBlockHeight4x4, - frame_header.columns4x4 + kMaxBlockWidth4x4, - sequence_header.use_128x128_superblock); - if (!block_parameters_holder.Init()) { + if (!frame_scratch_buffer->block_parameters_holder.Reset( + frame_header.rows4x4 + kMaxBlockHeight4x4, + frame_header.columns4x4 + kMaxBlockWidth4x4, + sequence_header.use_128x128_superblock)) { return kStatusOutOfMemory; } const dsp::Dsp* const dsp = @@ -685,24 +734,6 @@ StatusCode DecoderImpl::DecodeTiles( sequence_header.color_config.bitdepth); return kStatusInternalError; } - // If prev_segment_ids is a null pointer, it is treated as if it pointed to - // a segmentation map containing all 0s. 
- const SegmentationMap* prev_segment_ids = nullptr; - if (frame_header.primary_reference_frame == kPrimaryReferenceNone) { - frame_scratch_buffer->symbol_decoder_context.Initialize( - frame_header.quantizer.base_index); - } else { - const int index = - frame_header - .reference_frame_index[frame_header.primary_reference_frame]; - const RefCountedBuffer* prev_frame = state.reference_frame[index].get(); - frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext(); - if (frame_header.segmentation.enabled && - prev_frame->columns4x4() == frame_header.columns4x4 && - prev_frame->rows4x4() == frame_header.rows4x4) { - prev_segment_ids = prev_frame->segmentation_map(); - } - } const uint8_t tile_size_bytes = frame_header.tile_info.tile_size_bytes; const int tile_count = tile_groups.back().end + 1; @@ -714,26 +745,12 @@ StatusCode DecoderImpl::DecodeTiles( } ThreadingStrategy& threading_strategy = frame_scratch_buffer->threading_strategy; - if (!threading_strategy.Reset(frame_header, settings_.threads)) { + if (!IsFrameParallel() && + !threading_strategy.Reset(frame_header, settings_.threads)) { return kStatusOutOfMemory; } if (threading_strategy.row_thread_pool(0) != nullptr || IsFrameParallel()) { - const int block_width4x4_minus_one = - sequence_header.use_128x128_superblock ? 31 : 15; - const int block_width4x4_log2 = - sequence_header.use_128x128_superblock ? 5 : 4; - const int superblock_rows = - (frame_header.rows4x4 + block_width4x4_minus_one) >> - block_width4x4_log2; - const int superblock_columns = - (frame_header.columns4x4 + block_width4x4_minus_one) >> - block_width4x4_log2; - if (!frame_scratch_buffer->superblock_state.Reset(superblock_rows, - superblock_columns)) { - LIBGAV1_DLOG(ERROR, "Failed to allocate super_block_state.\n"); - return kStatusOutOfMemory; - } if (frame_scratch_buffer->residual_buffer_pool == nullptr) { frame_scratch_buffer->residual_buffer_pool.reset( new (std::nothrow) ResidualBufferPool( @@ -818,25 +835,80 @@ StatusCode DecoderImpl::DecodeTiles( } } - PostFilter post_filter( - frame_header, sequence_header, &frame_scratch_buffer->loop_filter_mask, - frame_scratch_buffer->cdef_index, - frame_scratch_buffer->inter_transform_sizes, - &frame_scratch_buffer->loop_restoration_info, &block_parameters_holder, - current_frame->buffer(), &frame_scratch_buffer->deblock_buffer, dsp, - threading_strategy.post_filter_thread_pool(), - frame_scratch_buffer->threaded_window_buffer.get(), - frame_scratch_buffer->superres_line_buffer.get(), - settings_.post_filter_mask); + PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer, + current_frame->buffer(), dsp, + settings_.post_filter_mask); + + if (IsFrameParallel()) { + // We can parse the current frame if all the reference frames have been + // parsed. + for (int i = 0; i < kNumReferenceFrameTypes; ++i) { + if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) { + continue; + } + if (!state.reference_frame[i]->WaitUntilParsed()) { + return kStatusUnknownError; + } + } + } + + // If prev_segment_ids is a null pointer, it is treated as if it pointed to + // a segmentation map containing all 0s. 
+ const SegmentationMap* prev_segment_ids = nullptr; + if (frame_header.primary_reference_frame == kPrimaryReferenceNone) { + frame_scratch_buffer->symbol_decoder_context.Initialize( + frame_header.quantizer.base_index); + } else { + const int index = + frame_header + .reference_frame_index[frame_header.primary_reference_frame]; + const RefCountedBuffer* prev_frame = state.reference_frame[index].get(); + frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext(); + if (frame_header.segmentation.enabled && + prev_frame->columns4x4() == frame_header.columns4x4 && + prev_frame->rows4x4() == frame_header.rows4x4) { + prev_segment_ids = prev_frame->segmentation_map(); + } + } + // The Tile class must make use of a separate buffer to store the unfiltered // pixels for the intra prediction of the next superblock row. This is done // only when one of the following conditions are true: - // * frame_parallel is true. + // * IsFrameParallel() is true. // * settings_.threads == 1. // In the non-frame-parallel multi-threaded case, we do not run the post // filters in the decode loop. So this buffer need not be used. const bool use_intra_prediction_buffer = IsFrameParallel() || settings_.threads == 1; + if (use_intra_prediction_buffer) { + if (!frame_scratch_buffer->intra_prediction_buffers.Resize( + frame_header.tile_info.tile_rows)) { + LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers."); + return kStatusOutOfMemory; + } + IntraPredictionBuffer* const intra_prediction_buffers = + frame_scratch_buffer->intra_prediction_buffers.get(); + for (int plane = 0; plane < num_planes; ++plane) { + const int subsampling = + (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x; + const size_t intra_prediction_buffer_size = + ((MultiplyBy4(frame_header.columns4x4) >> subsampling) * + (sequence_header.color_config.bitdepth == 8 ? 
sizeof(uint8_t) + : sizeof(uint16_t))); + for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows; + ++tile_row) { + if (!intra_prediction_buffers[tile_row][plane].Resize( + intra_prediction_buffer_size)) { + LIBGAV1_DLOG(ERROR, + "Failed to allocate intra prediction buffer for tile " + "row %d plane %d.\n", + tile_row, plane); + return kStatusOutOfMemory; + } + } + } + } + SymbolDecoderContext saved_symbol_decoder_context; int tile_index = 0; BlockingCounterWithStatus pending_tiles(tile_count); @@ -870,7 +942,7 @@ StatusCode DecoderImpl::DecodeTiles( tile_number, tile_group.data + byte_offset, tile_size, sequence_header, frame_header, current_frame, state, frame_scratch_buffer, wedge_masks_, &saved_symbol_decoder_context, - prev_segment_ids, &post_filter, &block_parameters_holder, dsp, + prev_segment_ids, &post_filter, dsp, threading_strategy.row_thread_pool(tile_index++), &pending_tiles, IsFrameParallel(), use_intra_prediction_buffer); if (tile == nullptr) { @@ -885,7 +957,12 @@ StatusCode DecoderImpl::DecodeTiles( } assert(tiles.size() == static_cast<size_t>(tile_count)); if (IsFrameParallel()) { - return DecodeTilesFrameParallel( + if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) { + return DecodeTilesFrameParallel( + sequence_header, frame_header, tiles, saved_symbol_decoder_context, + prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame); + } + return DecodeTilesThreadedFrameParallel( sequence_header, frame_header, tiles, saved_symbol_decoder_context, prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame); } @@ -894,10 +971,8 @@ StatusCode DecoderImpl::DecodeTiles( status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles, frame_scratch_buffer, &post_filter); } else { - status = DecodeTilesThreadedNonFrameParallel( - sequence_header, frame_header, tiles, tile_groups, - block_parameters_holder, frame_scratch_buffer, &post_filter, - &pending_tiles); + status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer, + &post_filter, &pending_tiles); } if (status != kStatusOk) return status; if (frame_header.enable_frame_end_update_cdf) { @@ -928,8 +1003,8 @@ StatusCode DecoderImpl::DecodeTilesNonFrameParallel( } } post_filter->ApplyFilteringForOneSuperBlockRow( - row4x4, block_width4x4, - row4x4 + block_width4x4 >= frame_header.rows4x4); + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/true); } frame_scratch_buffer->tile_scratch_buffer_pool.Release( std::move(tile_scratch_buffer)); @@ -937,11 +1012,7 @@ StatusCode DecoderImpl::DecodeTilesNonFrameParallel( } StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( - const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, const Vector<std::unique_ptr<Tile>>& tiles, - const Vector<ObuTileGroup>& tile_groups, - const BlockParametersHolder& block_parameters_holder, FrameScratchBuffer* const frame_scratch_buffer, PostFilter* const post_filter, BlockingCounterWithStatus* const pending_tiles) { @@ -964,7 +1035,7 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( tile_count) { if (!failed) { const auto& tile_ptr = tiles[index]; - if (!tile_ptr->ParseAndDecode(/*is_main_thread=*/false)) { + if (!tile_ptr->ParseAndDecode()) { LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); failed = true; } @@ -981,7 +1052,7 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( tile_count) { if (!tile_decoding_failed) { const auto& tile_ptr = tiles[index]; - if 
(!tile_ptr->ParseAndDecode(/*is_main_thread=*/true)) { + if (!tile_ptr->ParseAndDecode()) { LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); tile_decoding_failed = true; } @@ -995,15 +1066,8 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel( // Wait until all the tiles have been decoded. tile_decoding_failed |= !pending_tiles->Wait(); if (tile_decoding_failed) return kStatusUnknownError; - if (post_filter->DoDeblock() && kDeblockFilterBitMask) { - frame_scratch_buffer->loop_filter_mask.Build( - sequence_header, frame_header, tile_groups.front().start, - tile_groups.back().end, block_parameters_holder, - frame_scratch_buffer->inter_transform_sizes); - } - if (threading_strategy.post_filter_thread_pool() != nullptr) { - post_filter->ApplyFilteringThreaded(); - } + assert(threading_strategy.post_filter_thread_pool() != nullptr); + post_filter->ApplyFilteringThreaded(); return kStatusOk; } @@ -1048,8 +1112,8 @@ StatusCode DecoderImpl::DecodeTilesFrameParallel( } } const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow( - row4x4, block_width4x4, - row4x4 + block_width4x4 >= frame_header.rows4x4); + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/true); if (progress_row >= 0) { current_frame->SetProgress(progress_row); } @@ -1062,6 +1126,309 @@ StatusCode DecoderImpl::DecodeTilesFrameParallel( return kStatusOk; } +StatusCode DecoderImpl::DecodeTilesThreadedFrameParallel( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector<std::unique_ptr<Tile>>& tiles, + const SymbolDecoderContext& saved_symbol_decoder_context, + const SegmentationMap* const prev_segment_ids, + FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, RefCountedBuffer* const current_frame) { + // Parse the frame. + ThreadPool& thread_pool = + *frame_scratch_buffer->threading_strategy.thread_pool(); + std::atomic<int> tile_counter(0); + const int tile_count = static_cast<int>(tiles.size()); + const int num_workers = thread_pool.num_threads(); + BlockingCounterWithStatus parse_workers(num_workers); + // Submit tile parsing jobs to the thread pool. + for (int i = 0; i < num_workers; ++i) { + thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() { + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Parse()) { + LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number()); + failed = true; + } + } + } + parse_workers.Decrement(!failed); + }); + } + + // Have the current thread participate in parsing. + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Parse()) { + LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number()); + failed = true; + } + } + } + + // Wait until all the parse workers are done. This ensures that all the tiles + // have been parsed. 
+ if (!parse_workers.Wait() || failed) { + return kLibgav1StatusUnknownError; + } + if (frame_header.enable_frame_end_update_cdf) { + frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context; + } + current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context); + SetCurrentFrameSegmentationMap(frame_header, prev_segment_ids, current_frame); + current_frame->SetFrameState(kFrameStateParsed); + + // Decode the frame. + const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16; + const int block_width4x4_log2 = + sequence_header.use_128x128_superblock ? 5 : 4; + const int superblock_rows = + (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2; + if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) || + !frame_scratch_buffer->superblock_row_progress_condvar.Resize( + superblock_rows)) { + return kLibgav1StatusOutOfMemory; + } + int* const superblock_row_progress = + frame_scratch_buffer->superblock_row_progress.get(); + memset(superblock_row_progress, 0, + superblock_rows * sizeof(superblock_row_progress[0])); + frame_scratch_buffer->tile_decoding_failed = false; + const int tile_columns = frame_header.tile_info.tile_columns; + const bool decode_entire_tiles_in_worker_threads = + num_workers >= tile_columns; + BlockingCounter pending_jobs( + decode_entire_tiles_in_worker_threads ? num_workers : tile_columns); + if (decode_entire_tiles_in_worker_threads) { + // Submit tile decoding jobs to the thread pool. + tile_counter = 0; + for (int i = 0; i < num_workers; ++i) { + thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs, + frame_scratch_buffer, superblock_rows]() { + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (failed) continue; + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Decode( + &frame_scratch_buffer->superblock_row_mutex, + frame_scratch_buffer->superblock_row_progress.get(), + frame_scratch_buffer->superblock_row_progress_condvar + .get())) { + LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); + failed = true; + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + } + } + pending_jobs.Decrement(); + }); + } + } else { + // Schedule the jobs for first tile row. + for (int tile_index = 0; tile_index < tile_columns; ++tile_index) { + thread_pool.Schedule([this, &tiles, tile_index, block_width4x4, + tile_columns, superblock_rows, frame_scratch_buffer, + post_filter, &pending_jobs]() { + DecodeSuperBlockRowInTile( + tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, &pending_jobs); + pending_jobs.Decrement(); + }); + } + } + + // Current thread will do the post filters. 
+ std::condition_variable* const superblock_row_progress_condvar = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + const std::unique_ptr<Tile>* tile_row_base = &tiles[0]; + for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4; + row4x4 += block_width4x4, ++index) { + if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) { + tile_row_base += tile_columns; + } + { + std::unique_lock<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + while (superblock_row_progress[index] != tile_columns && + !frame_scratch_buffer->tile_decoding_failed) { + superblock_row_progress_condvar[index].wait(lock); + } + if (frame_scratch_buffer->tile_decoding_failed) break; + } + if (post_filter->DoDeblock()) { + // Apply deblocking filter for the tile boundaries of this superblock row. + // The deblocking filter for the internal blocks will be applied in the + // tile worker threads. In this thread, we will only have to apply + // deblocking filter for the tile boundaries. + ApplyDeblockingFilterForTileBoundaries( + post_filter, tile_row_base, frame_header, row4x4, block_width4x4, + tile_columns, decode_entire_tiles_in_worker_threads); + } + // Apply all the post filters other than deblocking. + const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow( + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/false); + if (progress_row >= 0) { + current_frame->SetProgress(progress_row); + } + } + // Wait until all the pending jobs are done. This ensures that all the tiles + // have been decoded and wrapped up. + pending_jobs.Wait(); + { + std::lock_guard<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + if (frame_scratch_buffer->tile_decoding_failed) { + return kLibgav1StatusUnknownError; + } + } + + current_frame->SetFrameState(kFrameStateDecoded); + return kStatusOk; +} + +void DecoderImpl::DecodeSuperBlockRowInTile( + const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4, + const int superblock_size4x4, const int tile_columns, + const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, BlockingCounter* const pending_jobs) { + std::unique_ptr<TileScratchBuffer> scratch_buffer = + frame_scratch_buffer->tile_scratch_buffer_pool.Get(); + if (scratch_buffer == nullptr) { + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + return; + } + Tile& tile = *tiles[tile_index]; + const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>( + row4x4, scratch_buffer.get()); + frame_scratch_buffer->tile_scratch_buffer_pool.Release( + std::move(scratch_buffer)); + if (!ok) { + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + return; + } + if (post_filter->DoDeblock()) { + // Apply vertical deblock filtering for all the columns in this tile except + // for the first 64 columns. + post_filter->ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(), + superblock_size4x4); + // Apply horizontal deblock filtering for all the columns in this tile + // except for the first and the last 64 columns. + // Note about the last tile of each row: For the last tile, column4x4_end + // may not be a multiple of 16. In that case it is still okay to simply + // subtract 16 since ApplyDeblockFilter() will only do the filters in + // increments of 64 columns (or 32 columns for chroma with subsampling). 
+ post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, + tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4); + } + const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4); + const int index = row4x4 >> superblock_size4x4_log2; + int* const superblock_row_progress = + frame_scratch_buffer->superblock_row_progress.get(); + std::condition_variable* const superblock_row_progress_condvar = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + bool notify; + { + std::lock_guard<std::mutex> lock( + frame_scratch_buffer->superblock_row_mutex); + notify = ++superblock_row_progress[index] == tile_columns; + } + if (notify) { + // We are done decoding this superblock row. Notify the post filtering + // thread. + superblock_row_progress_condvar[index].notify_one(); + } + // Schedule the next superblock row (if one exists). + ThreadPool& thread_pool = + *frame_scratch_buffer->threading_strategy.thread_pool(); + const int next_row4x4 = row4x4 + superblock_size4x4; + if (!tile.IsRow4x4Inside(next_row4x4)) { + tile_index += tile_columns; + } + if (tile_index >= tiles.size()) return; + pending_jobs->IncrementBy(1); + thread_pool.Schedule([this, &tiles, tile_index, next_row4x4, + superblock_size4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, pending_jobs]() { + DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4, + superblock_size4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, pending_jobs); + pending_jobs->Decrement(); + }); +} + +void DecoderImpl::ApplyDeblockingFilterForTileBoundaries( + PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base, + const ObuFrameHeader& frame_header, int row4x4, int block_width4x4, + int tile_columns, bool decode_entire_tiles_in_worker_threads) { + // Apply vertical deblock filtering for the first 64 columns of each tile. + for (int tile_column = 0; tile_column < tile_columns; ++tile_column) { + const Tile& tile = *tile_row_base[tile_column]; + post_filter->ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, tile.column4x4_start(), + tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + } + if (decode_entire_tiles_in_worker_threads && + row4x4 == tile_row_base[0]->row4x4_start()) { + // This is the first superblock row of a tile row. In this case, apply + // horizontal deblock filtering for the entire superblock row. + post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0, + frame_header.columns4x4, block_width4x4); + } else { + // Apply horizontal deblock filtering for the first 64 columns of the + // first tile. + const Tile& first_tile = *tile_row_base[0]; + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(), + first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + // Apply horizontal deblock filtering for the last 64 columns of the + // previous tile and the first 64 columns of the current tile. + for (int tile_column = 1; tile_column < tile_columns; ++tile_column) { + const Tile& tile = *tile_row_base[tile_column]; + // If the previous tile has more than 64 columns, then include those + // for the horizontal deblock. + const Tile& previous_tile = *tile_row_base[tile_column - 1]; + const int column4x4_start = + tile.column4x4_start() - + ((tile.column4x4_start() - kNum4x4InLoopFilterUnit != + previous_tile.column4x4_start()) + ? 
kNum4x4InLoopFilterUnit + : 0); + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, column4x4_start, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + } + // Apply horizontal deblock filtering for the last 64 columns of the + // last tile. + const Tile& last_tile = *tile_row_base[tile_columns - 1]; + // Identify the last column4x4 value and do horizontal filtering for + // that column4x4. The value of last column4x4 is the nearest multiple + // of 16 that is before tile.column4x4_end(). + const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15; + // If column4x4_start is the same as tile.column4x4_start() then it + // means that the last tile has <= 64 columns. So there is nothing left + // to deblock (since it was already deblocked in the loop above). + if (column4x4_start != last_tile.column4x4_start()) { + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, column4x4_start, + last_tile.column4x4_end(), block_width4x4); + } + } +} + void DecoderImpl::SetCurrentFrameSegmentationMap( const ObuFrameHeader& frame_header, const SegmentationMap* prev_segment_ids, RefCountedBuffer* const current_frame) { @@ -1092,10 +1459,7 @@ StatusCode DecoderImpl::ApplyFilmGrain( return kStatusOk; } if (!frame_header.show_existing_frame && - frame_header.refresh_frame_flags == 0 && - // TODO(vigneshv): In frame parallel mode, we never do film grain in - // place. Revisit this and see if this constraint need to be enforced. - !IsFrameParallel()) { + frame_header.refresh_frame_flags == 0) { // If show_existing_frame is true, then the current frame is a previously // saved reference frame. If refresh_frame_flags is nonzero, then the // state_.UpdateReferenceFrames() call above has saved the current frame as diff --git a/chromium/third_party/libgav1/src/src/decoder_impl.h b/chromium/third_party/libgav1/src/src/decoder_impl.h index dbc79ed85d7..4d58999c95e 100644 --- a/chromium/third_party/libgav1/src/src/decoder_impl.h +++ b/chromium/third_party/libgav1/src/src/decoder_impl.h @@ -18,7 +18,6 @@ #define LIBGAV1_SRC_DECODER_IMPL_H_ #include <array> -#include <atomic> #include <condition_variable> // NOLINT (unapproved c++11 header) #include <cstddef> #include <cstdint> @@ -32,7 +31,6 @@ #include "src/gav1/decoder_buffer.h" #include "src/gav1/decoder_settings.h" #include "src/gav1/status_code.h" -#include "src/loop_filter_mask.h" #include "src/obu_parser.h" #include "src/residual_buffer_pool.h" #include "src/symbol_decoder_context.h" @@ -129,6 +127,19 @@ class DecoderImpl : public Allocable { private: explicit DecoderImpl(const DecoderSettings* settings); StatusCode Init(); + // Called when the first frame is enqueued. It does the OBU parsing for one + // temporal unit to retrieve the tile configuration and sets up the frame + // threading if frame parallel mode is allowed. It also initializes the + // |temporal_units_| queue based on the number of frame threads. + // + // The following are the limitations of the current implementation: + // * It assumes that all frames in the video have the same tile + // configuration. The frame parallel threading model will not be updated + // based on tile configuration changes mid-stream. + // * The above assumption holds true even when there is a new coded video + // sequence (i.e.) a new sequence header. + StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data, + size_t size); // Used only in frame parallel mode. 
Signals failure and waits until the // worker threads are aborted if |status| is a failure status. If |status| is // equal to kStatusOk or kStatusTryAgain, this function does not do anything. @@ -175,11 +186,7 @@ class DecoderImpl : public Allocable { const Vector<std::unique_ptr<Tile>>& tiles, FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter); StatusCode DecodeTilesThreadedNonFrameParallel( - const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, const Vector<std::unique_ptr<Tile>>& tiles, - const Vector<ObuTileGroup>& tile_groups, - const BlockParametersHolder& block_parameters_holder, FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter, BlockingCounterWithStatus* pending_tiles); StatusCode DecodeTilesFrameParallel( @@ -190,6 +197,36 @@ class DecoderImpl : public Allocable { const SegmentationMap* prev_segment_ids, FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter, RefCountedBuffer* current_frame); + StatusCode DecodeTilesThreadedFrameParallel( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector<std::unique_ptr<Tile>>& tiles, + const SymbolDecoderContext& saved_symbol_decoder_context, + const SegmentationMap* prev_segment_ids, + FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter, + RefCountedBuffer* current_frame); + // Helper function used by DecodeTilesThreadedFrameParallel. Decodes the + // superblock row starting at |row4x4| for tile at index |tile_index| in the + // list of tiles |tiles|. If the decoding is successful, then it does the + // following: + // * Schedule the next superblock row in the current tile column for + // decoding (the next superblock row may be in a different tile than the + // current one). + // * If an entire superblock row of the frame has been decoded, it notifies + // the waiters (if there are any). + void DecodeSuperBlockRowInTile(const Vector<std::unique_ptr<Tile>>& tiles, + size_t tile_index, int row4x4, + int superblock_size4x4, int tile_columns, + int superblock_rows, + FrameScratchBuffer* frame_scratch_buffer, + PostFilter* post_filter, + BlockingCounter* pending_jobs); + // Helper function used by DecodeTilesThreadedFrameParallel. Applies the + // deblocking filter for tile boundaries for the superblock row at |row4x4|. + void ApplyDeblockingFilterForTileBoundaries( + PostFilter* post_filter, const std::unique_ptr<Tile>* tile_row_base, + const ObuFrameHeader& frame_header, int row4x4, int block_width4x4, + int tile_columns, bool decode_entire_tiles_in_worker_threads); // Sets the current frame's segmentation map for two cases. The third case // is handled in Tile::DecodeBlock(). void SetCurrentFrameSegmentationMap(const ObuFrameHeader& frame_header, @@ -206,6 +243,11 @@ class DecoderImpl : public Allocable { bool IsNewSequenceHeader(const ObuParser& obu); bool IsFrameParallel() const { return frame_thread_pool_ != nullptr; } + bool HasFailure() { + std::lock_guard<std::mutex> lock(mutex_); + return failure_status_ != kStatusOk; + } + Queue<TemporalUnit> temporal_units_; DecoderState state_; @@ -228,21 +270,16 @@ class DecoderImpl : public Allocable { // 2) DecodeTiles() // Both of these functions have to respond to the other one failing by // aborting whatever they are doing. This variable is used to accomplish that. - std::atomic<bool> abort_{false}; - // Stores the failure status if |abort_| is true. 
- std::atomic<StatusCode> failure_status_{kStatusOk}; + // If |failure_status_| is not kStatusOk, then the two functions will try to + // abort as early as they can. + StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_); ObuSequenceHeader sequence_header_ = {}; // If true, sequence_header is valid. bool has_sequence_header_ = false; -#if defined(ENABLE_FRAME_PARALLEL) - // TODO(b/142583029): A copy of the DecoderSettings is made to facilitate the - // development of frame parallel mode behind a compile time flag. - DecoderSettings settings_; -#else const DecoderSettings& settings_; -#endif + bool seen_first_frame_ = false; }; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc index 1fccfb47b36..c005f081279 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc @@ -36,16 +36,7 @@ namespace dsp { namespace low_bitdepth { namespace { -// CdefDirection: -// Mirror values and pad to 16 elements. -alignas(16) constexpr uint32_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, - 120, 105, 120, 140, 168, 210, - 280, 420, 840, 0}; - -// Used when calculating odd |cost[x]| values to mask off unwanted elements. -// Holds elements 1 3 5 X 5 3 1 X -alignas(16) constexpr uint32_t kDivisionTableOdd[] = {420, 210, 140, 0, - 140, 210, 420, 0}; +#include "src/dsp/cdef.inc" // Expand |a| to int8x16_t, left shift it by |shift| and sum the low // and high values with |b| and |c| respectively. @@ -159,10 +150,10 @@ uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) { // |cost[0]| and |cost[4]| square the input and sum with the corresponding // element from the other end of the vector: -// |kDivisionTable[]| element: +// |kCdefDivisionTable[]| element: // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * -// kDivisionTable[i + 1]; -// cost[0] += Square(partial[0][7]) * kDivisionTable[8]; +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8]; // Because everything is being summed into a single value the distributive // property allows us to mirror the division table and accumulate once. uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b, @@ -179,7 +170,7 @@ uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b, uint32_t SquareAccumulate(const uint16x8_t a) { uint32x4_t c = Square(vget_low_u16(a)); c = SquareAccumulate(c, vget_high_u16(a)); - c = vmulq_n_u32(c, kDivisionTable[7]); + c = vmulq_n_u32(c, kCdefDivisionTable[7]); return SumVector(c); } @@ -188,7 +179,7 @@ uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask, // Remove elements 0-2. 
uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a))); c = vaddq_u32(c, Square(vget_high_u16(a))); - c = vmulq_n_u32(c, kDivisionTable[7]); + c = vmulq_n_u32(c, kCdefDivisionTable[7]); c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]); c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]); @@ -230,14 +221,14 @@ void CdefDirection_NEON(const void* const source, ptrdiff_t stride, cost[6] = SquareAccumulate(partial_lo[6]); const uint32x4_t division_table[4] = { - vld1q_u32(kDivisionTable), vld1q_u32(kDivisionTable + 4), - vld1q_u32(kDivisionTable + 8), vld1q_u32(kDivisionTable + 12)}; + vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4), + vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)}; cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table); cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table); - const uint32x4_t division_table_odd[2] = {vld1q_u32(kDivisionTableOdd), - vld1q_u32(kDivisionTableOdd + 4)}; + const uint32x4_t division_table_odd[2] = { + vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)}; const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)}; @@ -328,31 +319,34 @@ int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference, return vsubq_s16(veorq_s16(clamp_abs_diff, sign), sign); } -template <int width> +template <int width, bool enable_primary = true, bool enable_secondary = true> void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, const int direction, const int primary_strength, const int secondary_strength, const int damping, uint8_t* dst, const ptrdiff_t dst_stride) { static_assert(width == 8 || width == 4, ""); + static_assert(enable_primary || enable_secondary, ""); const uint16x8_t cdef_large_value_mask = vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue)); const int16x8_t primary_threshold = vdupq_n_s16(primary_strength); const int16x8_t secondary_threshold = vdupq_n_s16(secondary_strength); int16x8_t primary_damping_shift, secondary_damping_shift; + // FloorLog2() requires input to be > 0. - if (primary_strength == 0) { - primary_damping_shift = vdupq_n_s16(0); - } else { + // 8-bit damping range: Y: [3, 6], UV: [2, 5]. + if (enable_primary) { + // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary + // for UV filtering. primary_damping_shift = vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength))); } - - if (secondary_strength == 0) { - secondary_damping_shift = vdupq_n_s16(0); - } else { + if (enable_secondary) { + // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is + // necessary. + assert(damping - FloorLog2(secondary_strength) >= 0); secondary_damping_shift = - vdupq_n_s16(-std::max(0, damping - FloorLog2(secondary_strength))); + vdupq_n_s16(-(damping - FloorLog2(secondary_strength))); } const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0]; @@ -366,105 +360,112 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, } else { pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride)); } + uint16x8_t min = pixel; uint16x8_t max = pixel; - - // Primary |direction|. - uint16x8_t primary_val[4]; - if (width == 8) { - LoadDirection(src, src_stride, primary_val, direction); + int16x8_t sum; + + if (enable_primary) { + // Primary |direction|. 
+ uint16x8_t primary_val[4]; + if (width == 8) { + LoadDirection(src, src_stride, primary_val, direction); + } else { + LoadDirection4(src, src_stride, primary_val, direction); + } + + min = vminq_u16(min, primary_val[0]); + min = vminq_u16(min, primary_val[1]); + min = vminq_u16(min, primary_val[2]); + min = vminq_u16(min, primary_val[3]); + + // Convert kCdefLargeValue to 0 before calculating max. + max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask)); + + sum = Constrain(primary_val[0], pixel, primary_threshold, + primary_damping_shift); + sum = vmulq_n_s16(sum, primary_tap_0); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[1], pixel, primary_threshold, + primary_damping_shift), + primary_tap_0); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[2], pixel, primary_threshold, + primary_damping_shift), + primary_tap_1); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[3], pixel, primary_threshold, + primary_damping_shift), + primary_tap_1); } else { - LoadDirection4(src, src_stride, primary_val, direction); + sum = vdupq_n_s16(0); } - min = vminq_u16(min, primary_val[0]); - min = vminq_u16(min, primary_val[1]); - min = vminq_u16(min, primary_val[2]); - min = vminq_u16(min, primary_val[3]); - - // Convert kCdefLargeValue to 0 before calculating max. - max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask)); - - int16x8_t sum = Constrain(primary_val[0], pixel, primary_threshold, - primary_damping_shift); - sum = vmulq_n_s16(sum, primary_tap_0); - sum = vmlaq_n_s16(sum, - Constrain(primary_val[1], pixel, primary_threshold, - primary_damping_shift), - primary_tap_0); - sum = vmlaq_n_s16(sum, - Constrain(primary_val[2], pixel, primary_threshold, - primary_damping_shift), - primary_tap_1); - sum = vmlaq_n_s16(sum, - Constrain(primary_val[3], pixel, primary_threshold, - primary_damping_shift), - primary_tap_1); - - // Secondary |direction| values (+/- 2). Clamp |direction|. - uint16x8_t secondary_val[8]; - if (width == 8) { - LoadDirection(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); - } else { - LoadDirection4(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection4(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. 
+ uint16x8_t secondary_val[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val, direction + 2); + LoadDirection(src, src_stride, secondary_val + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val, direction + 2); + LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); + } + + min = vminq_u16(min, secondary_val[0]); + min = vminq_u16(min, secondary_val[1]); + min = vminq_u16(min, secondary_val[2]); + min = vminq_u16(min, secondary_val[3]); + min = vminq_u16(min, secondary_val[4]); + min = vminq_u16(min, secondary_val[5]); + min = vminq_u16(min, secondary_val[6]); + min = vminq_u16(min, secondary_val[7]); + + max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask)); + max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask)); + + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[0], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[1], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[2], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[3], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[4], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[5], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[6], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[7], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); } - - min = vminq_u16(min, secondary_val[0]); - min = vminq_u16(min, secondary_val[1]); - min = vminq_u16(min, secondary_val[2]); - min = vminq_u16(min, secondary_val[3]); - min = vminq_u16(min, secondary_val[4]); - min = vminq_u16(min, secondary_val[5]); - min = vminq_u16(min, secondary_val[6]); - min = vminq_u16(min, secondary_val[7]); - - max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask)); - - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[0], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[1], pixel, secondary_threshold, - secondary_damping_shift), - 
kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[2], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[3], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[4], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[5], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap0); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[6], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - sum = vmlaq_n_s16(sum, - Constrain(secondary_val[7], pixel, secondary_threshold, - secondary_damping_shift), - kCdefSecondaryTap1); - // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15); sum = vaddq_s16(sum, vdupq_n_s16(8)); @@ -495,26 +496,48 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, // inside the frame. However it requires the source input to be padded with a // constant large value if at the boundary. The input must be uint16_t. void CdefFilter_NEON(const void* const source, const ptrdiff_t source_stride, - const int rows4x4, const int columns4x4, const int curr_x, - const int curr_y, const int subsampling_x, - const int subsampling_y, const int primary_strength, - const int secondary_strength, const int damping, - const int direction, void* const dest, + const int block_width, const int block_height, + const int primary_strength, const int secondary_strength, + const int damping, const int direction, void* const dest, const ptrdiff_t dest_stride) { - const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x; - const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y; - const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x); - const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y); const auto* src = static_cast<const uint16_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - if (block_width == 8) { - DoCdef<8>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + // TODO(slavarnway): Change dsp->cdef_filter to dsp->cdef_filter[2][2]. This + // would eliminate the strength checks. 
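// [Editor's note, not part of the change] The TODO above proposes replacing
// the strength checks below with a [2][2] function-pointer table indexed by
// (primary enabled, secondary enabled). A hypothetical sketch of that idea;
// the table, its name, and the Dispatch8 wrapper are assumptions made for
// illustration and do not exist in libgav1 as of this commit.
using CdefFilterFn = void (*)(const uint16_t* src, ptrdiff_t src_stride,
                              int height, int direction, int primary_strength,
                              int secondary_strength, int damping, uint8_t* dst,
                              ptrdiff_t dst_stride);

// Entry [p][s] handles primary strength p (0 = zero, 1 = nonzero) and
// secondary strength s. Both strengths zero never reaches CDEF, hence nullptr.
const CdefFilterFn kCdefFilter8[2][2] = {
    {nullptr, DoCdef<8, /*enable_primary=*/false>},
    {DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>,
     DoCdef<8>},
};

void Dispatch8(const uint16_t* src, ptrdiff_t src_stride, int height,
               int direction, int primary_strength, int secondary_strength,
               int damping, uint8_t* dst, ptrdiff_t dst_stride) {
  kCdefFilter8[primary_strength > 0][secondary_strength > 0](
      src, src_stride, height, direction, primary_strength, secondary_strength,
      damping, dst, dst_stride);
}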
+ if (secondary_strength > 0) { + if (primary_strength > 0) { + if (block_width == 8) { + DoCdef<8>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } else { + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } } else { - assert(block_width == 4); - DoCdef<4>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } } } diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc index 34868826dcd..424be020bff 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc @@ -1350,8 +1350,6 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, const int height, void* dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; - constexpr int kernel_offset = (8 - num_taps) / 2; - src += src_stride * kernel_offset; const int16_t* src_y = src; // |dest| is 16-bit in compound mode, Pixel otherwise. uint16_t* dest16_y = static_cast<uint16_t*>(dest); @@ -1425,8 +1423,6 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, const int step_y, const int height, void* dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; - constexpr int kernel_offset = (8 - num_taps) / 2; - src += src_stride * kernel_offset; // A possible improvement is to use arithmetic to decide how many times to // apply filters to same source before checking whether to load new srcs. // However, this will only improve performance with very small step sizes. @@ -1498,15 +1494,14 @@ void ConvolveScale2D_NEON(const void* const reference, const int subpixel_y, const int step_x, const int step_y, const int width, const int height, void* prediction, const ptrdiff_t pred_stride) { - // TODO(petersonab): Reduce the height here by using the vertical filter - // size and offset horizontal filter. Reduce intermediate block stride to - // width to make smaller blocks faster. 
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + assert(step_x <= 2048); + const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); const int intermediate_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> kScaleSubPixelBits) + - kSubPixelTaps; - // TODO(b/133525024): Decide whether it's worth branching to a special case - // when step_x or step_y is 1024. + num_vert_taps; assert(step_x <= 2048); // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. @@ -1520,11 +1515,27 @@ void ConvolveScale2D_NEON(const void* const reference, // Similarly for height. int filter_index = GetFilterIndex(horizontal_filter_index, width); int16_t* intermediate = intermediate_result; - const auto* src = static_cast<const uint8_t*>(reference); const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference); + const int vert_kernel_offset = (8 - num_vert_taps) / 2; + src += vert_kernel_offset * src_stride; + + // Derive the maximum value of |step_x| at which all source values fit in one + // 16-byte load. Final index is src_x + |num_taps| - 1 < 16 + // step_x*7 is the final base subpel index for the shuffle mask for filter + // inputs in each iteration on large blocks. When step_x is large, we need a + // larger structure and use a larger table lookup in order to gather all + // filter inputs. + // |num_taps| - 1 is the shuffle index of the final filter input. + const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int kernel_start_ceiling = 16 - num_horiz_taps; + // This truncated quotient |grade_x_threshold| selects |step_x| such that: + // (step_x * 7) >> kScaleSubPixelBits < single load limit + const int grade_x_threshold = + (kernel_start_ceiling << kScaleSubPixelBits) / 7; switch (filter_index) { case 0: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontalSigned6Tap<2>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); @@ -1535,7 +1546,7 @@ void ConvolveScale2D_NEON(const void* const reference, } break; case 1: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); @@ -1547,7 +1558,7 @@ void ConvolveScale2D_NEON(const void* const reference, } break; case 2: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontalSigned8Tap<2>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); @@ -1558,7 +1569,7 @@ void ConvolveScale2D_NEON(const void* const reference, } break; case 3: - if (step_x > 1024) { + if (step_x > grade_x_threshold) { ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc index f63fabdd7e2..e89ba36773b 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc @@ -16,7 +16,6 @@ #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON - #include <arm_neon.h> #include <cassert> @@ -33,6 +32,11 @@ namespace dsp { namespace low_bitdepth { namespace { +template <int bytes> +inline uint16x4_t VshrU128(const uint16x8_t a) { 
+ return vext_u16(vget_low_u16(a), vget_high_u16(a), bytes / 2); +} + // Wiener // Must make a local copy of coefficients to help compiler know that they have @@ -50,7 +54,6 @@ inline void PopulateWienerCoefficients( assert(direction == WienerInfo::kVertical); filter_3 = 128; } - for (int i = 0; i < 3; ++i) { const int16_t coeff = restoration_info.wiener_info.filter[direction][i]; filter[i] = coeff; @@ -76,74 +79,24 @@ inline int CountZeroCoefficients(const int16_t filter[2][kSubPixelTaps]) { return number_zero_coefficients; } -inline void LoadHorizontal4Tap3(const uint8_t* source, uint8x8_t s[3]) { - s[0] = vld1_u8(source); - // Faster than using vshr_n_u64(). - s[1] = vext_u8(s[0], s[0], 1); - s[2] = vext_u8(s[0], s[0], 2); -} - -inline void LoadHorizontal4Tap5(const uint8_t* source, uint8x8_t s[5]) { - s[0] = vld1_u8(source); - // Faster than using vshr_n_u64(). - s[1] = vext_u8(s[0], s[0], 1); - s[2] = vext_u8(s[0], s[0], 2); - s[3] = vext_u8(s[0], s[0], 3); - s[4] = vext_u8(s[0], s[0], 4); -} - -inline void LoadHorizontal8Tap3(const uint8_t* source, uint8x8_t s[3]) { - const uint8x16_t r = vld1q_u8(source); - s[0] = vget_low_u8(r); - s[1] = vext_u8(s[0], vget_high_u8(r), 1); - s[2] = vext_u8(s[0], vget_high_u8(r), 2); -} - -inline void LoadHorizontal8Tap5(const uint8_t* source, uint8x8_t s[5]) { - const uint8x16_t r = vld1q_u8(source); - s[0] = vget_low_u8(r); - s[1] = vext_u8(s[0], vget_high_u8(r), 1); - s[2] = vext_u8(s[0], vget_high_u8(r), 2); - s[3] = vext_u8(s[0], vget_high_u8(r), 3); - s[4] = vext_u8(s[0], vget_high_u8(r), 4); -} - -inline void LoadHorizontalTap7(const uint8_t* source, uint8x8_t s[7]) { - // This is just as fast as an 8x8 transpose but avoids over-reading - // extra rows. It always over-reads by at least 1 value. On small widths - // (4xH) it over-reads by 9 values. - const uint8x16_t r = vld1q_u8(source); - s[0] = vget_low_u8(r); - s[1] = vext_u8(s[0], vget_high_u8(r), 1); - s[2] = vext_u8(s[0], vget_high_u8(r), 2); - s[3] = vext_u8(s[0], vget_high_u8(r), 3); - s[4] = vext_u8(s[0], vget_high_u8(r), 4); - s[5] = vext_u8(s[0], vget_high_u8(r), 5); - s[6] = vext_u8(s[0], vget_high_u8(r), 6); -} - inline int16x8_t HorizontalSum(const uint8x8_t a[3], const int16_t filter[2], int16x8_t sum) { const int16x8_t a_0_2 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[2])); sum = vmlaq_n_s16(sum, a_0_2, filter[0]); sum = vmlaq_n_s16(sum, vreinterpretq_s16_u16(vmovl_u8(a[1])), filter[1]); - sum = vrshrq_n_s16(sum, kInterRoundBitsHorizontal); - // Delaying |horizontal_rounding| until after down shifting allows the sum to // stay in 16 bits. // |horizontal_rounding| = 1 << (bitdepth + kWienerFilterBits - 1) // 1 << ( 8 + 7 - 1) // Plus |kInterRoundBitsHorizontal| and it works out to 1 << 11. sum = vaddq_s16(sum, vdupq_n_s16(1 << 11)); - // Just like |horizontal_rounding|, adding |filter[3]| at this point allows // the sum to stay in 16 bits. // But wait! We *did* calculate |filter[3]| and used it in the sum! But it was // offset by 128. 
Fix that here: // |src[3]| * 128 >> 3 == |src[3]| << 4 sum = vaddq_s16(sum, vreinterpretq_s16_u16(vshll_n_u8(a[1], 4))); - // Saturate to // [0, // (1 << (bitdepth + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1)] @@ -153,111 +106,6 @@ inline int16x8_t HorizontalSum(const uint8x8_t a[3], const int16_t filter[2], return sum; } -inline int16x8_t HorizontalSumTap3(const uint8x8_t a[3], - const int16_t filter[2]) { - return HorizontalSum(a, filter, vdupq_n_s16(0)); -} - -inline int16x8_t HorizontalSumTap5(const uint8x8_t a[5], - const int16_t filter[3]) { - const int16x8_t a_0_4 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[4])); - const int16x8_t sum = vmulq_n_s16(a_0_4, filter[0]); - return HorizontalSum(a + 1, filter + 1, sum); -} - -inline int16x8_t HorizontalSumTap7(const uint8x8_t a[7], - const int16_t filter[4]) { - const int16x8_t a_0_6 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[6])); - const int16x8_t a_1_5 = vreinterpretq_s16_u16(vaddl_u8(a[1], a[5])); - int16x8_t sum = vmulq_n_s16(a_0_6, filter[0]); - sum = vmlaq_n_s16(sum, a_1_5, filter[1]); - return HorizontalSum(a + 2, filter + 2, sum); -} - -inline int16x8_t WienerHorizontal4Tap3(const uint8_t* source, - const int16_t filter[2]) { - uint8x8_t s[5]; - LoadHorizontal4Tap3(source, s); - return HorizontalSumTap3(s, filter); -} - -inline int16x8_t WienerHorizontal4Tap5(const uint8_t* source, - const int16_t filter[3]) { - uint8x8_t s[5]; - LoadHorizontal4Tap5(source, s); - return HorizontalSumTap5(s, filter); -} - -inline int16x8_t WienerHorizontal4Tap7(const uint8_t* source, - const int16_t filter[4]) { - uint8x8_t s[7]; - LoadHorizontalTap7(source, s); - return HorizontalSumTap7(s, filter); -} - -inline int16x8_t WienerHorizontal4x2Tap3(const uint8_t* source, - const ptrdiff_t stride, - const int16_t filter[2]) { - uint8x8_t s0[5], s1[5], s[5]; - LoadHorizontal4Tap3(source + 0 * stride, s0); - LoadHorizontal4Tap3(source + 1 * stride, s1); - s[0] = InterleaveLow32(s0[0], s1[0]); - s[1] = InterleaveLow32(s0[1], s1[1]); - s[2] = InterleaveLow32(s0[2], s1[2]); - return HorizontalSumTap3(s, filter); -} - -inline int16x8_t WienerHorizontal4x2Tap5(const uint8_t* source, - const ptrdiff_t stride, - const int16_t filter[3]) { - uint8x8_t s0[5], s1[5], s[5]; - LoadHorizontal4Tap5(source + 0 * stride, s0); - LoadHorizontal4Tap5(source + 1 * stride, s1); - s[0] = InterleaveLow32(s0[0], s1[0]); - s[1] = InterleaveLow32(s0[1], s1[1]); - s[2] = InterleaveLow32(s0[2], s1[2]); - s[3] = InterleaveLow32(s0[3], s1[3]); - s[4] = InterleaveLow32(s0[4], s1[4]); - return HorizontalSumTap5(s, filter); -} - -inline int16x8_t WienerHorizontal4x2Tap7(const uint8_t* source, - const ptrdiff_t stride, - const int16_t filter[4]) { - uint8x8_t s0[7], s1[7], s[7]; - LoadHorizontalTap7(source + 0 * stride, s0); - LoadHorizontalTap7(source + 1 * stride, s1); - s[0] = InterleaveLow32(s0[0], s1[0]); - s[1] = InterleaveLow32(s0[1], s1[1]); - s[2] = InterleaveLow32(s0[2], s1[2]); - s[3] = InterleaveLow32(s0[3], s1[3]); - s[4] = InterleaveLow32(s0[4], s1[4]); - s[5] = InterleaveLow32(s0[5], s1[5]); - s[6] = InterleaveLow32(s0[6], s1[6]); - return HorizontalSumTap7(s, filter); -} - -inline int16x8_t WienerHorizontal8Tap3(const uint8_t* source, - const int16_t filter[2]) { - uint8x8_t s[3]; - LoadHorizontal8Tap3(source, s); - return HorizontalSumTap3(s, filter); -} - -inline int16x8_t WienerHorizontal8Tap5(const uint8_t* source, - const int16_t filter[3]) { - uint8x8_t s[5]; - LoadHorizontal8Tap5(source, s); - return HorizontalSumTap5(s, filter); -} - -inline int16x8_t 
WienerHorizontal8Tap7(const uint8_t* source, - const int16_t filter[4]) { - uint8x8_t s[7]; - LoadHorizontalTap7(source, s); - return HorizontalSumTap7(s, filter); -} - inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2], int32x4_t sum[2]) { // -(1 << (bitdepth + kInterRoundBitsVertical - 1)) @@ -265,7 +113,6 @@ inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2], constexpr int vertical_rounding = -(1 << 18); const int32x4_t rounding = vdupq_n_s32(vertical_rounding); const int16x8_t a_0_2 = vaddq_s16(a[0], a[2]); - sum[0] = vaddq_s32(sum[0], rounding); sum[1] = vaddq_s32(sum[1], rounding); sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_2), filter[0]); @@ -274,44 +121,9 @@ inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2], sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a[1]), filter[1]); const uint16x4_t sum_lo_16 = vqrshrun_n_s32(sum[0], 11); const uint16x4_t sum_hi_16 = vqrshrun_n_s32(sum[1], 11); - return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16)); } -inline uint8x8_t WienerVerticalTap3(const int16x8_t a[3], - const int16_t filter[2]) { - int32x4_t sum[2]; - sum[0] = sum[1] = vdupq_n_s32(0); - return WienerVertical(a, filter, sum); -} - -inline uint8x8_t WienerVerticalTap5(const int16x8_t a[5], - const int16_t filter[3]) { - const int16x8_t a_0_4 = vaddq_s16(a[0], a[4]); - int32x4_t sum[2]; - - sum[0] = sum[1] = vdupq_n_s32(0); - sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_4), filter[0]); - sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_4), filter[0]); - - return WienerVertical(a + 1, filter + 1, sum); -} - -inline uint8x8_t WienerVerticalTap7(const int16x8_t a[7], - const int16_t filter[4]) { - const int16x8_t a_0_6 = vaddq_s16(a[0], a[6]); - const int16x8_t a_1_5 = vaddq_s16(a[1], a[5]); - int32x4_t sum[2]; - - sum[0] = sum[1] = vdupq_n_s32(0); - sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_6), filter[0]); - sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_6), filter[0]); - sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_1_5), filter[1]); - sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_1_5), filter[1]); - - return WienerVertical(a + 2, filter + 2, sum); -} - // For width 16 and up, store the horizontal results, and then do the vertical // filter row by row. This is faster than doing it column by column when // considering cache issues. 
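// [Editor's sketch, not part of the change] The comment above is the rationale
// for the restructured WienerFilter_NEON in the following hunk: run the
// horizontal taps over the whole block first, store the int16 intermediate
// rows, then sweep that buffer top to bottom for the vertical taps so the
// loads stay sequential. A scalar outline of that two-pass layout; libgav1's
// rounding, offsets and saturation are omitted, and the flat |h_filter| /
// |v_filter| arrays are illustrative.
#include <cstddef>
#include <cstdint>

void SeparableFilter(const uint8_t* src, ptrdiff_t src_stride, int width,
                     int height, int taps, const int16_t* h_filter,
                     const int16_t* v_filter, int16_t* intermediate,
                     uint8_t* dst, ptrdiff_t dst_stride) {
  // Pass 1: horizontal taps for every row the vertical pass will need.
  const int intermediate_height = height + taps - 1;
  for (int y = 0; y < intermediate_height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < taps; ++k) sum += h_filter[k] * src[x + k];
      intermediate[y * width + x] = static_cast<int16_t>(sum);
    }
    src += src_stride;
  }
  // Pass 2: vertical taps, consuming |taps| consecutive intermediate rows per
  // output row; the rows are contiguous, so the pass streams through memory.
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < taps; ++k)
        sum += v_filter[k] * intermediate[(y + k) * width + x];
      dst[x] = static_cast<uint8_t>(sum);  // real code rounds and clamps here
    }
    dst += dst_stride;
  }
}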
@@ -330,360 +142,168 @@ void WienerFilter_NEON(const void* const source, void* const dest, int16_t* wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); int16_t filter_horizontal[kSubPixelTaps / 2]; int16_t filter_vertical[kSubPixelTaps / 2]; - int16x8_t a[7]; - PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter_horizontal); PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter_vertical); - if (number_zero_coefficients == 0) { // 7-tap - src -= kCenterTap * source_stride + kCenterTap; - - if (width > 8) { - int y = height + kSubPixelTaps - 2; - do { - int x = 0; - do { - const int16x8_t a = WienerHorizontal8Tap7(src + x, filter_horizontal); - vst1q_s16(wiener_buffer + x, a); - x += 8; - } while (x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); - - wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); - - y = height; + src -= (kCenterTap - 1) * source_stride + kCenterTap; + int y = height + kSubPixelTaps - 4; + do { + wiener_buffer += width; + int x = 0; do { - int x = 0; - do { - a[0] = vld1q_s16(wiener_buffer + x + 0 * width); - a[1] = vld1q_s16(wiener_buffer + x + 1 * width); - a[2] = vld1q_s16(wiener_buffer + x + 2 * width); - a[3] = vld1q_s16(wiener_buffer + x + 3 * width); - a[4] = vld1q_s16(wiener_buffer + x + 4 * width); - a[5] = vld1q_s16(wiener_buffer + x + 5 * width); - a[6] = vld1q_s16(wiener_buffer + x + 6 * width); - - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - vst1_u8(dst + x, r); - x += 8; - } while (x < width); - wiener_buffer += width; - dst += dest_stride; - } while (--y != 0); - } else if (width > 4) { - a[0] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[1] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[2] = WienerHorizontal8Tap7(src, filter_horizontal); + // This is just as fast as an 8x8 transpose but avoids over-reading + // extra rows. It always over-reads by at least 1 value. On small widths + // (4xH) it over-reads by 9 values. + const uint8x16_t r = vld1q_u8(src + x); + uint8x8_t s[7]; + s[0] = vget_low_u8(r); + s[1] = vext_u8(s[0], vget_high_u8(r), 1); + s[2] = vext_u8(s[0], vget_high_u8(r), 2); + s[3] = vext_u8(s[0], vget_high_u8(r), 3); + s[4] = vext_u8(s[0], vget_high_u8(r), 4); + s[5] = vext_u8(s[0], vget_high_u8(r), 5); + s[6] = vext_u8(s[0], vget_high_u8(r), 6); + const int16x8_t s_0_6 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[6])); + const int16x8_t s_1_5 = vreinterpretq_s16_u16(vaddl_u8(s[1], s[5])); + int16x8_t sum = vmulq_n_s16(s_0_6, filter_horizontal[0]); + sum = vmlaq_n_s16(sum, s_1_5, filter_horizontal[1]); + const int16x8_t a = HorizontalSum(s + 2, filter_horizontal + 2, sum); + vst1q_s16(wiener_buffer + x, a); + x += 8; + } while (x < width); src += source_stride; - a[3] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[4] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - a[5] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - - int y = height; + } while (--y != 0); + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. 
+ memcpy(wiener_buffer + width, wiener_buffer, + sizeof(*wiener_buffer) * width); + wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); + memcpy(wiener_buffer, wiener_buffer + width, + sizeof(*wiener_buffer) * width); + + y = height; + do { + int x = 0; do { - a[6] = WienerHorizontal8Tap7(src, filter_horizontal); - src += source_stride; - - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - vst1_u8(dst, r); - dst += dest_stride; - - a[0] = a[1]; - a[1] = a[2]; - a[2] = a[3]; - a[3] = a[4]; - a[4] = a[5]; - a[5] = a[6]; - } while (--y != 0); - } else { - int y = height; - - if ((y & 1) != 0) { - --y; - a[0] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += source_stride; - a[2] = WienerHorizontal4x2Tap7(src + source_stride, source_stride, - filter_horizontal); - a[4] = WienerHorizontal4x2Tap7(src + 3 * source_stride, source_stride, - filter_horizontal); - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - a[6] = - WienerHorizontal4Tap7(src + 5 * source_stride, filter_horizontal); - a[5] = vcombine_s16(vget_high_s16(a[4]), vget_low_s16(a[6])); - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - StoreLo4(dst, r); - dst += dest_stride; - } - - if (y != 0) { - a[0] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[2] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[4] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - - do { - a[6] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal); - src += 2 * source_stride; - a[5] = vcombine_s16(vget_high_s16(a[4]), vget_low_s16(a[6])); - - const uint8x8_t r = WienerVerticalTap7(a, filter_vertical); - StoreLo4(dst, r); - dst += dest_stride; - StoreHi4(dst, r); - dst += dest_stride; - - a[0] = a[2]; - a[1] = a[3]; - a[2] = a[4]; - a[3] = a[5]; - a[4] = a[6]; - y -= 2; - } while (y != 0); - } - } + int16x8_t a[7]; + a[0] = vld1q_s16(wiener_buffer + x + 0 * width); + a[1] = vld1q_s16(wiener_buffer + x + 1 * width); + a[2] = vld1q_s16(wiener_buffer + x + 2 * width); + a[3] = vld1q_s16(wiener_buffer + x + 3 * width); + a[4] = vld1q_s16(wiener_buffer + x + 4 * width); + a[5] = vld1q_s16(wiener_buffer + x + 5 * width); + a[6] = vld1q_s16(wiener_buffer + x + 6 * width); + const int16x8_t a_0_6 = vaddq_s16(a[0], a[6]); + const int16x8_t a_1_5 = vaddq_s16(a[1], a[5]); + int32x4_t sum[2]; + sum[0] = sum[1] = vdupq_n_s32(0); + sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_6), filter_vertical[0]); + sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_6), filter_vertical[0]); + sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_1_5), filter_vertical[1]); + sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_1_5), filter_vertical[1]); + const uint8x8_t r = WienerVertical(a + 2, filter_vertical + 2, sum); + vst1_u8(dst + x, r); + x += 8; + } while (x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } else if (number_zero_coefficients == 1) { // 5-tap src -= (kCenterTap - 1) * source_stride + kCenterTap - 1; - - if (width > 8) { - int y = height + kSubPixelTaps - 4; - do { - int x = 0; - do { - const int16x8_t a = - WienerHorizontal8Tap5(src + x, filter_horizontal + 1); - vst1q_s16(wiener_buffer + x, a); - x += 8; - } while (x < width); - src += 
source_stride; - wiener_buffer += width; - } while (--y != 0); - - wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); - - y = height; + int y = height + kSubPixelTaps - 4; + do { + int x = 0; do { - int x = 0; - do { - a[0] = vld1q_s16(wiener_buffer + x + 0 * width); - a[1] = vld1q_s16(wiener_buffer + x + 1 * width); - a[2] = vld1q_s16(wiener_buffer + x + 2 * width); - a[3] = vld1q_s16(wiener_buffer + x + 3 * width); - a[4] = vld1q_s16(wiener_buffer + x + 4 * width); - - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - vst1_u8(dst + x, r); - x += 8; - } while (x < width); - wiener_buffer += width; - dst += dest_stride; - } while (--y != 0); - } else if (width > 4) { - a[0] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - a[1] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - a[2] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - a[3] = WienerHorizontal8Tap5(src, filter_horizontal + 1); + const uint8x16_t r = vld1q_u8(src + x); + uint8x8_t s[5]; + s[0] = vget_low_u8(r); + s[1] = vext_u8(s[0], vget_high_u8(r), 1); + s[2] = vext_u8(s[0], vget_high_u8(r), 2); + s[3] = vext_u8(s[0], vget_high_u8(r), 3); + s[4] = vext_u8(s[0], vget_high_u8(r), 4); + const int16x8_t s_0_4 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[4])); + const int16x8_t sum = vmulq_n_s16(s_0_4, filter_horizontal[1]); + const int16x8_t a = HorizontalSum(s + 1, filter_horizontal + 2, sum); + vst1q_s16(wiener_buffer + x, a); + x += 8; + } while (x < width); src += source_stride; + wiener_buffer += width; + } while (--y != 0); - int y = height; + wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); + y = height; + do { + int x = 0; do { - a[4] = WienerHorizontal8Tap5(src, filter_horizontal + 1); - src += source_stride; - - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - vst1_u8(dst, r); - dst += dest_stride; - - a[0] = a[1]; - a[1] = a[2]; - a[2] = a[3]; - a[3] = a[4]; - } while (--y != 0); - } else { - int y = height; - - if ((y & 1) != 0) { - --y; - a[0] = - WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1); - src += source_stride; - a[2] = WienerHorizontal4x2Tap5(src + source_stride, source_stride, - filter_horizontal + 1); - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - a[4] = WienerHorizontal4Tap5(src + 3 * source_stride, - filter_horizontal + 1); - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - StoreLo4(dst, r); - dst += dest_stride; - } - - if (y != 0) { - a[0] = - WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1); - src += 2 * source_stride; - a[2] = - WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1); - src += 2 * source_stride; - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - - do { - a[4] = WienerHorizontal4x2Tap5(src, source_stride, - filter_horizontal + 1); - src += 2 * source_stride; - a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4])); - - const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1); - StoreLo4(dst, r); - dst += dest_stride; - StoreHi4(dst, r); - dst += dest_stride; - - a[0] = a[2]; - a[1] = a[3]; - a[2] = a[4]; - y -= 2; - } while (y != 0); - } - } + int16x8_t a[5]; + a[0] = vld1q_s16(wiener_buffer + x + 0 * width); + a[1] = vld1q_s16(wiener_buffer + x + 1 * width); + a[2] = vld1q_s16(wiener_buffer + x + 2 * width); + a[3] = vld1q_s16(wiener_buffer + x + 3 * width); + a[4] = 
vld1q_s16(wiener_buffer + x + 4 * width); + const int16x8_t a_0_4 = vaddq_s16(a[0], a[4]); + int32x4_t sum[2]; + sum[0] = sum[1] = vdupq_n_s32(0); + sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_4), filter_vertical[1]); + sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_4), filter_vertical[1]); + const uint8x8_t r = WienerVertical(a + 1, filter_vertical + 2, sum); + vst1_u8(dst + x, r); + x += 8; + } while (x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } else { // 3-tap src -= (kCenterTap - 2) * source_stride + kCenterTap - 2; - - if (width > 8) { - int y = height + kSubPixelTaps - 6; - do { - int x = 0; - do { - const int16x8_t a = - WienerHorizontal8Tap3(src + x, filter_horizontal + 2); - vst1q_s16(wiener_buffer + x, a); - x += 8; - } while (x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); - - wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); - - y = height; + int y = height + kSubPixelTaps - 6; + do { + int x = 0; do { - int x = 0; - do { - a[0] = vld1q_s16(wiener_buffer + x + 0 * width); - a[1] = vld1q_s16(wiener_buffer + x + 1 * width); - a[2] = vld1q_s16(wiener_buffer + x + 2 * width); - - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - vst1_u8(dst + x, r); - x += 8; - } while (x < width); - wiener_buffer += width; - dst += dest_stride; - } while (--y != 0); - } else if (width > 4) { - a[0] = WienerHorizontal8Tap3(src, filter_horizontal + 2); - src += source_stride; - a[1] = WienerHorizontal8Tap3(src, filter_horizontal + 2); + const uint8x16_t r = vld1q_u8(src + x); + uint8x8_t s[3]; + s[0] = vget_low_u8(r); + s[1] = vext_u8(s[0], vget_high_u8(r), 1); + s[2] = vext_u8(s[0], vget_high_u8(r), 2); + const int16x8_t a = + HorizontalSum(s, filter_horizontal + 2, vdupq_n_s16(0)); + vst1q_s16(wiener_buffer + x, a); + x += 8; + } while (x < width); src += source_stride; + wiener_buffer += width; + } while (--y != 0); - int y = height; + wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); + y = height; + do { + int x = 0; do { - a[2] = WienerHorizontal8Tap3(src, filter_horizontal + 2); - src += source_stride; - - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - vst1_u8(dst, r); - dst += dest_stride; - - a[0] = a[1]; - a[1] = a[2]; - } while (--y != 0); - } else { - int y = height; - - if ((y & 1) != 0) { - --y; - a[0] = - WienerHorizontal4x2Tap3(src, source_stride, filter_horizontal + 2); - src += source_stride; - a[2] = - WienerHorizontal4Tap3(src + source_stride, filter_horizontal + 2); - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - StoreLo4(dst, r); - dst += dest_stride; - } - - if (y != 0) { - a[0] = - WienerHorizontal4x2Tap3(src, source_stride, filter_horizontal + 2); - src += 2 * source_stride; - - do { - a[2] = WienerHorizontal4x2Tap3(src, source_stride, - filter_horizontal + 2); - src += 2 * source_stride; - a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2])); - - const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2); - StoreLo4(dst, r); - dst += dest_stride; - StoreHi4(dst, r); - dst += dest_stride; - - a[0] = a[2]; - y -= 2; - } while (y != 0); - } - } + int16x8_t a[3]; + a[0] = vld1q_s16(wiener_buffer + x + 0 * width); + a[1] = vld1q_s16(wiener_buffer + x + 1 * width); + a[2] = vld1q_s16(wiener_buffer + x + 2 * width); + int32x4_t sum[2]; + sum[0] = sum[1] = vdupq_n_s32(0); + const uint8x8_t r = WienerVertical(a, filter_vertical + 2, sum); + 
vst1_u8(dst + x, r); + x += 8; + } while (x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } } +//------------------------------------------------------------------------------ // SGR -constexpr int kSgrProjScaleBits = 20; -constexpr int kSgrProjRestoreBits = 4; -constexpr int kSgrProjSgrBits = 8; -constexpr int kSgrProjReciprocalBits = 12; - -// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); -// sgr_ma2 = 256 - a2 -constexpr uint8_t kSgrMa2Lookup[256] = { - 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14, - 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, - 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0}; - template <int n> inline uint16x4_t CalculateSgrMA2(const uint32x4_t sum_sq, const uint16x4_t sum, const uint32_t s) { @@ -697,15 +317,15 @@ inline uint16x4_t CalculateSgrMA2(const uint32x4_t sum_sq, const uint16x4_t sum, // z = RightShiftWithRounding(p * s, kSgrProjScaleBits); const uint32x4_t pxs = vmulq_n_u32(p, s); - // For some reason vrshrn_n_u32() (narrowing shift) can only shift by 16 - // and kSgrProjScaleBits is 20. + // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits + // is 20. 
const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits); return vmovn_u32(shifted); } -inline uint16x4_t CalculateB2Shifted(const uint8x8_t sgr_ma2, - const uint16x4_t sum, - const uint32_t one_over_n) { +inline uint16x4_t CalculateIntermediate4(const uint8x8_t sgr_ma2, + const uint16x4_t sum, + const uint32_t one_over_n) { // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n // 1 << kSgrProjSgrBits = 256 // |a2| = [1, 256] @@ -726,9 +346,9 @@ inline uint16x4_t CalculateB2Shifted(const uint8x8_t sgr_ma2, return vrshrn_n_u32(b2, kSgrProjReciprocalBits); } -inline uint16x8_t CalculateB2Shifted(const uint8x8_t sgr_ma2, - const uint16x8_t sum, - const uint32_t one_over_n) { +inline uint16x8_t CalculateIntermediate8(const uint8x8_t sgr_ma2, + const uint16x8_t sum, + const uint32_t one_over_n) { // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n // 1 << kSgrProjSgrBits = 256 // |a2| = [1, 256] @@ -753,41 +373,41 @@ inline uint16x8_t CalculateB2Shifted(const uint8x8_t sgr_ma2, return vcombine_u16(b2_lo, b2_hi); } -inline uint16x8_t Sum3(const uint16x8_t left, const uint16x8_t middle, - const uint16x8_t right) { +inline uint16x4_t Sum3(const uint16x4_t left, const uint16x4_t middle, + const uint16x4_t right) { + const uint16x4_t sum = vadd_u16(left, middle); + return vadd_u16(sum, right); +} + +inline uint16x8_t Sum3_16(const uint16x8_t left, const uint16x8_t middle, + const uint16x8_t right) { const uint16x8_t sum = vaddq_u16(left, middle); return vaddq_u16(sum, right); } -inline uint32x4_t Sum3(const uint32x4_t left, const uint32x4_t middle, - const uint32x4_t right) { +inline uint32x4_t Sum3_32(const uint32x4_t left, const uint32x4_t middle, + const uint32x4_t right) { const uint32x4_t sum = vaddq_u32(left, middle); return vaddq_u32(sum, right); } -inline uint16x8_t Sum3W(const uint8x8_t left, const uint8x8_t middle, - const uint8x8_t right) { +inline uint16x8_t Sum3W_16(const uint8x8_t left, const uint8x8_t middle, + const uint8x8_t right) { const uint16x8_t sum = vaddl_u8(left, middle); return vaddw_u8(sum, right); } -inline uint32x4_t Sum3W(const uint16x4_t left, const uint16x4_t middle, - const uint16x4_t right) { - const uint32x4_t sum = vaddl_u16(left, middle); - return vaddw_u16(sum, right); -} - -inline uint16x4_t Sum3(const uint16x4_t left, const uint16x4_t middle, - const uint16x4_t right) { - const uint16x4_t sum = vadd_u16(left, middle); - return vadd_u16(sum, right); +inline uint16x8_t Sum3W_16(const uint8x8_t a[3]) { + return Sum3W_16(a[0], a[1], a[2]); } -inline uint16x8_t Sum3W(const uint8x8_t a[3]) { - return Sum3W(a[0], a[1], a[2]); +inline uint32x4_t Sum3W_32(const uint16x4_t left, const uint16x4_t middle, + const uint16x4_t right) { + const uint32x4_t sum = vaddl_u16(left, middle); + return vaddw_u16(sum, right); } -inline uint16x8x2_t Sum3W(const uint8x16_t a[3]) { +inline uint16x8x2_t Sum3W_16x2(const uint8x16_t a[3]) { const uint8x8_t low0 = vget_low_u8(a[0]); const uint8x8_t low1 = vget_low_u8(a[1]); const uint8x8_t low2 = vget_low_u8(a[2]); @@ -795,8 +415,8 @@ inline uint16x8x2_t Sum3W(const uint8x16_t a[3]) { const uint8x8_t high1 = vget_high_u8(a[1]); const uint8x8_t high2 = vget_high_u8(a[2]); uint16x8x2_t sum; - sum.val[0] = Sum3W(low0, low1, low2); - sum.val[1] = Sum3W(high0, high1, high2); + sum.val[0] = Sum3W_16(low0, low1, low2); + sum.val[1] = Sum3W_16(high0, high1, high2); return sum; } @@ -808,32 +428,31 @@ inline uint32x4x2_t Sum3W(const uint16x8_t a[3]) { const uint16x4_t high1 = vget_high_u16(a[1]); const uint16x4_t high2 = 
vget_high_u16(a[2]); uint32x4x2_t sum; - sum.val[0] = Sum3W(low0, low1, low2); - sum.val[1] = Sum3W(high0, high1, high2); + sum.val[0] = Sum3W_32(low0, low1, low2); + sum.val[1] = Sum3W_32(high0, high1, high2); return sum; } template <int index> -inline uint32x4_t Sum3WLow(const uint16x8x2_t a[3]) { +inline uint32x4_t Sum3WLo(const uint16x8x2_t a[3]) { const uint16x4_t low0 = vget_low_u16(a[0].val[index]); const uint16x4_t low1 = vget_low_u16(a[1].val[index]); const uint16x4_t low2 = vget_low_u16(a[2].val[index]); - return Sum3W(low0, low1, low2); + return Sum3W_32(low0, low1, low2); } -template <int index> -inline uint32x4_t Sum3WHigh(const uint16x8x2_t a[3]) { - const uint16x4_t high0 = vget_high_u16(a[0].val[index]); - const uint16x4_t high1 = vget_high_u16(a[1].val[index]); - const uint16x4_t high2 = vget_high_u16(a[2].val[index]); - return Sum3W(high0, high1, high2); +inline uint32x4_t Sum3WHi(const uint16x8x2_t a[3]) { + const uint16x4_t high0 = vget_high_u16(a[0].val[0]); + const uint16x4_t high1 = vget_high_u16(a[1].val[0]); + const uint16x4_t high2 = vget_high_u16(a[2].val[0]); + return Sum3W_32(high0, high1, high2); } inline uint32x4x3_t Sum3W(const uint16x8x2_t a[3]) { uint32x4x3_t sum; - sum.val[0] = Sum3WLow<0>(a); - sum.val[1] = Sum3WHigh<0>(a); - sum.val[2] = Sum3WLow<1>(a); + sum.val[0] = Sum3WLo<0>(a); + sum.val[1] = Sum3WHi(a); + sum.val[2] = Sum3WLo<1>(a); return sum; } @@ -844,35 +463,35 @@ inline uint16x4_t Sum5(const uint16x4_t a[5]) { return vadd_u16(sum, a[4]); } -inline uint16x8_t Sum5(const uint16x8_t a[5]) { +inline uint16x8_t Sum5_16(const uint16x8_t a[5]) { const uint16x8_t sum01 = vaddq_u16(a[0], a[1]); const uint16x8_t sum23 = vaddq_u16(a[2], a[3]); const uint16x8_t sum = vaddq_u16(sum01, sum23); return vaddq_u16(sum, a[4]); } -inline uint32x4_t Sum5(const uint32x4_t a[5]) { +inline uint32x4_t Sum5_32(const uint32x4_t a[5]) { const uint32x4_t sum01 = vaddq_u32(a[0], a[1]); const uint32x4_t sum23 = vaddq_u32(a[2], a[3]); const uint32x4_t sum = vaddq_u32(sum01, sum23); return vaddq_u32(sum, a[4]); } -inline uint16x8_t Sum5W(const uint8x8_t a[5]) { +inline uint16x8_t Sum5W_16(const uint8x8_t a[5]) { const uint16x8_t sum01 = vaddl_u8(a[0], a[1]); const uint16x8_t sum23 = vaddl_u8(a[2], a[3]); const uint16x8_t sum = vaddq_u16(sum01, sum23); return vaddw_u8(sum, a[4]); } -inline uint32x4_t Sum5W(const uint16x4_t a[5]) { +inline uint32x4_t Sum5W_32(const uint16x4_t a[5]) { const uint32x4_t sum01 = vaddl_u16(a[0], a[1]); const uint32x4_t sum23 = vaddl_u16(a[2], a[3]); const uint32x4_t sum0123 = vaddq_u32(sum01, sum23); return vaddw_u16(sum0123, a[4]); } -inline uint16x8x2_t Sum5W(const uint8x16_t a[5]) { +inline uint16x8x2_t Sum5W_16D(const uint8x16_t a[5]) { uint16x8x2_t sum; uint8x8_t low[5], high[5]; low[0] = vget_low_u8(a[0]); @@ -885,12 +504,12 @@ inline uint16x8x2_t Sum5W(const uint8x16_t a[5]) { high[2] = vget_high_u8(a[2]); high[3] = vget_high_u8(a[3]); high[4] = vget_high_u8(a[4]); - sum.val[0] = Sum5W(low); - sum.val[1] = Sum5W(high); + sum.val[0] = Sum5W_16(low); + sum.val[1] = Sum5W_16(high); return sum; } -inline uint32x4x2_t Sum5W(const uint16x8_t a[5]) { +inline uint32x4x2_t Sum5W_32x2(const uint16x8_t a[5]) { uint32x4x2_t sum; uint16x4_t low[5], high[5]; low[0] = vget_low_u16(a[0]); @@ -903,113 +522,112 @@ inline uint32x4x2_t Sum5W(const uint16x8_t a[5]) { high[2] = vget_high_u16(a[2]); high[3] = vget_high_u16(a[3]); high[4] = vget_high_u16(a[4]); - sum.val[0] = Sum5W(low); - sum.val[1] = Sum5W(high); + sum.val[0] = Sum5W_32(low); + sum.val[1] = 
Sum5W_32(high); return sum; } template <int index> -inline uint32x4_t Sum5WLow(const uint16x8x2_t a[5]) { +inline uint32x4_t Sum5WLo(const uint16x8x2_t a[5]) { uint16x4_t low[5]; low[0] = vget_low_u16(a[0].val[index]); low[1] = vget_low_u16(a[1].val[index]); low[2] = vget_low_u16(a[2].val[index]); low[3] = vget_low_u16(a[3].val[index]); low[4] = vget_low_u16(a[4].val[index]); - return Sum5W(low); + return Sum5W_32(low); } -template <int index> -inline uint32x4_t Sum5WHigh(const uint16x8x2_t a[5]) { +inline uint32x4_t Sum5WHi(const uint16x8x2_t a[5]) { uint16x4_t high[5]; - high[0] = vget_high_u16(a[0].val[index]); - high[1] = vget_high_u16(a[1].val[index]); - high[2] = vget_high_u16(a[2].val[index]); - high[3] = vget_high_u16(a[3].val[index]); - high[4] = vget_high_u16(a[4].val[index]); - return Sum5W(high); + high[0] = vget_high_u16(a[0].val[0]); + high[1] = vget_high_u16(a[1].val[0]); + high[2] = vget_high_u16(a[2].val[0]); + high[3] = vget_high_u16(a[3].val[0]); + high[4] = vget_high_u16(a[4].val[0]); + return Sum5W_32(high); } -inline uint32x4x3_t Sum5W(const uint16x8x2_t a[5]) { +inline uint32x4x3_t Sum5W_32x3(const uint16x8x2_t a[5]) { uint32x4x3_t sum; - sum.val[0] = Sum5WLow<0>(a); - sum.val[1] = Sum5WHigh<0>(a); - sum.val[2] = Sum5WLow<1>(a); + sum.val[0] = Sum5WLo<0>(a); + sum.val[1] = Sum5WHi(a); + sum.val[2] = Sum5WLo<1>(a); return sum; } inline uint16x4_t Sum3Horizontal(const uint16x8_t a) { const uint16x4_t left = vget_low_u16(a); - const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); - const uint16x4_t right = vext_u16(vget_low_u16(a), vget_high_u16(a), 2); + const uint16x4_t middle = VshrU128<2>(a); + const uint16x4_t right = VshrU128<4>(a); return Sum3(left, middle, right); } -inline uint16x8_t Sum3Horizontal(const uint16x8x2_t a) { +inline uint16x8_t Sum3Horizontal_16(const uint16x8x2_t a) { const uint16x8_t left = a.val[0]; const uint16x8_t middle = vextq_u16(a.val[0], a.val[1], 1); const uint16x8_t right = vextq_u16(a.val[0], a.val[1], 2); - return Sum3(left, middle, right); + return Sum3_16(left, middle, right); } -inline uint32x4_t Sum3Horizontal(const uint32x4x2_t a) { +inline uint32x4_t Sum3Horizontal_32(const uint32x4x2_t a) { const uint32x4_t left = a.val[0]; const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 2); - return Sum3(left, middle, right); + return Sum3_32(left, middle, right); } -inline uint32x4x2_t Sum3Horizontal(const uint32x4x3_t a) { +inline uint32x4x2_t Sum3Horizontal_32x2(const uint32x4x3_t a) { uint32x4x2_t sum; { const uint32x4_t left = a.val[0]; const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 2); - sum.val[0] = Sum3(left, middle, right); + sum.val[0] = Sum3_32(left, middle, right); } { const uint32x4_t left = a.val[1]; const uint32x4_t middle = vextq_u32(a.val[1], a.val[2], 1); const uint32x4_t right = vextq_u32(a.val[1], a.val[2], 2); - sum.val[1] = Sum3(left, middle, right); + sum.val[1] = Sum3_32(left, middle, right); } return sum; } inline uint16x4_t Sum3HorizontalOffset1(const uint16x8_t a) { - const uint16x4_t left = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); - const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 2); - const uint16x4_t right = vext_u16(vget_low_u16(a), vget_high_u16(a), 3); + const uint16x4_t left = VshrU128<2>(a); + const uint16x4_t middle = VshrU128<4>(a); + const uint16x4_t right = VshrU128<6>(a); return Sum3(left, middle, right); } -inline 
uint16x8_t Sum3HorizontalOffset1(const uint16x8x2_t a) { +inline uint16x8_t Sum3HorizontalOffset1_16(const uint16x8x2_t a) { const uint16x8_t left = vextq_u16(a.val[0], a.val[1], 1); const uint16x8_t middle = vextq_u16(a.val[0], a.val[1], 2); const uint16x8_t right = vextq_u16(a.val[0], a.val[1], 3); - return Sum3(left, middle, right); + return Sum3_16(left, middle, right); } -inline uint32x4_t Sum3HorizontalOffset1(const uint32x4x2_t a) { +inline uint32x4_t Sum3HorizontalOffset1_32(const uint32x4x2_t a) { const uint32x4_t left = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 2); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 3); - return Sum3(left, middle, right); + return Sum3_32(left, middle, right); } -inline uint32x4x2_t Sum3HorizontalOffset1(const uint32x4x3_t a) { +inline uint32x4x2_t Sum3HorizontalOffset1_32x2(const uint32x4x3_t a) { uint32x4x2_t sum; { const uint32x4_t left = vextq_u32(a.val[0], a.val[1], 1); const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 2); const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 3); - sum.val[0] = Sum3(left, middle, right); + sum.val[0] = Sum3_32(left, middle, right); } { const uint32x4_t left = vextq_u32(a.val[1], a.val[2], 1); const uint32x4_t middle = vextq_u32(a.val[1], a.val[2], 2); const uint32x4_t right = vextq_u32(a.val[1], a.val[2], 3); - sum.val[1] = Sum3(left, middle, right); + sum.val[1] = Sum3_32(left, middle, right); } return sum; } @@ -1017,34 +635,34 @@ inline uint32x4x2_t Sum3HorizontalOffset1(const uint32x4x3_t a) { inline uint16x4_t Sum5Horizontal(const uint16x8_t a) { uint16x4_t s[5]; s[0] = vget_low_u16(a); - s[1] = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); - s[2] = vext_u16(vget_low_u16(a), vget_high_u16(a), 2); - s[3] = vext_u16(vget_low_u16(a), vget_high_u16(a), 3); + s[1] = VshrU128<2>(a); + s[2] = VshrU128<4>(a); + s[3] = VshrU128<6>(a); s[4] = vget_high_u16(a); return Sum5(s); } -inline uint16x8_t Sum5Horizontal(const uint16x8x2_t a) { +inline uint16x8_t Sum5Horizontal_16(const uint16x8x2_t a) { uint16x8_t s[5]; s[0] = a.val[0]; s[1] = vextq_u16(a.val[0], a.val[1], 1); s[2] = vextq_u16(a.val[0], a.val[1], 2); s[3] = vextq_u16(a.val[0], a.val[1], 3); - s[4] = vcombine_u16(vget_high_u16(a.val[0]), vget_low_u16(a.val[1])); - return Sum5(s); + s[4] = vextq_u16(a.val[0], a.val[1], 4); + return Sum5_16(s); } -inline uint32x4_t Sum5Horizontal(const uint32x4x2_t a) { +inline uint32x4_t Sum5Horizontal_32(const uint32x4x2_t a) { uint32x4_t s[5]; s[0] = a.val[0]; s[1] = vextq_u32(a.val[0], a.val[1], 1); s[2] = vextq_u32(a.val[0], a.val[1], 2); s[3] = vextq_u32(a.val[0], a.val[1], 3); s[4] = a.val[1]; - return Sum5(s); + return Sum5_32(s); } -inline uint32x4x2_t Sum5Horizontal(const uint32x4x3_t a) { +inline uint32x4x2_t Sum5Horizontal_32x2(const uint32x4x3_t a) { uint32x4x2_t sum; uint32x4_t s[5]; s[0] = a.val[0]; @@ -1052,43 +670,42 @@ inline uint32x4x2_t Sum5Horizontal(const uint32x4x3_t a) { s[2] = vextq_u32(a.val[0], a.val[1], 2); s[3] = vextq_u32(a.val[0], a.val[1], 3); s[4] = a.val[1]; - sum.val[0] = Sum5(s); + sum.val[0] = Sum5_32(s); s[0] = a.val[1]; s[1] = vextq_u32(a.val[1], a.val[2], 1); s[2] = vextq_u32(a.val[1], a.val[2], 2); s[3] = vextq_u32(a.val[1], a.val[2], 3); s[4] = a.val[2]; - sum.val[1] = Sum5(s); + sum.val[1] = Sum5_32(s); return sum; } template <int size, int offset> -inline void PreProcess4(const uint8x8_t* const row, - const uint16x8_t* const row_sq, const uint32_t s, - uint16_t* const dst) { +inline void BoxFilterPreProcess4(const 
uint8x8_t* const row, + const uint16x8_t* const row_sq, + const uint32_t s, uint16_t* const dst) { static_assert(offset == 0 || offset == 1, ""); // Number of elements in the box being summed. constexpr uint32_t n = size * size; constexpr uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const uint16x4_t v_255 = vdup_n_u16(255); uint16x4_t sum; uint32x4_t sum_sq; if (size == 3) { if (offset == 0) { - sum = Sum3Horizontal(Sum3W(row)); - sum_sq = Sum3Horizontal(Sum3W(row_sq)); + sum = Sum3Horizontal(Sum3W_16(row)); + sum_sq = Sum3Horizontal_32(Sum3W(row_sq)); } else { - sum = Sum3HorizontalOffset1(Sum3W(row)); - sum_sq = Sum3HorizontalOffset1(Sum3W(row_sq)); + sum = Sum3HorizontalOffset1(Sum3W_16(row)); + sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq)); } } if (size == 5) { - sum = Sum5Horizontal(Sum5W(row)); - sum_sq = Sum5Horizontal(Sum5W(row_sq)); + sum = Sum5Horizontal(Sum5W_16(row)); + sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq)); } const uint16x4_t z0 = CalculateSgrMA2<n>(sum_sq, sum, s); - const uint16x4_t z = vmin_u16(v_255, z0); + const uint16x4_t z = vmin_u16(z0, vdup_n_u16(255)); // Using vget_lane_s16() can save a sign extension instruction. // Add 4 0s for memory initialization purpose only. const uint8_t lookup[8] = { @@ -1101,42 +718,41 @@ inline void PreProcess4(const uint8x8_t* const row, kSgrMa2Lookup[vget_lane_s16(vreinterpret_s16_u16(z), 2)], kSgrMa2Lookup[vget_lane_s16(vreinterpret_s16_u16(z), 3)]}; const uint8x8_t sgr_ma2 = vld1_u8(lookup); - const uint16x4_t b2 = CalculateB2Shifted(sgr_ma2, sum, one_over_n); + const uint16x4_t b2 = CalculateIntermediate4(sgr_ma2, sum, one_over_n); const uint16x8_t sgr_ma2_b2 = vcombine_u16(vreinterpret_u16_u8(sgr_ma2), b2); vst1q_u16(dst, sgr_ma2_b2); } template <int size, int offset> -inline void PreProcess8(const uint8x16_t* const row, - const uint16x8x2_t* const row_sq, const uint32_t s, - uint8x8_t* const sgr_ma2, uint16x8_t* const b2, - uint16_t* const dst) { +inline void BoxFilterPreProcess8(const uint8x16_t* const row, + const uint16x8x2_t* const row_sq, + const uint32_t s, uint8x8_t* const sgr_ma2, + uint16x8_t* const b2, uint16_t* const dst) { // Number of elements in the box being summed. constexpr uint32_t n = size * size; constexpr uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const uint16x8_t v_255 = vdupq_n_u16(255); uint16x8_t sum; uint32x4x2_t sum_sq; if (size == 3) { if (offset == 0) { - sum = Sum3Horizontal(Sum3W(row)); - sum_sq = Sum3Horizontal(Sum3W(row_sq)); + sum = Sum3Horizontal_16(Sum3W_16x2(row)); + sum_sq = Sum3Horizontal_32x2(Sum3W(row_sq)); } else /* if (offset == 1) */ { - sum = Sum3HorizontalOffset1(Sum3W(row)); - sum_sq = Sum3HorizontalOffset1(Sum3W(row_sq)); + sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row)); + sum_sq = Sum3HorizontalOffset1_32x2(Sum3W(row_sq)); } } if (size == 5) { - sum = Sum5Horizontal(Sum5W(row)); - sum_sq = Sum5Horizontal(Sum5W(row_sq)); + sum = Sum5Horizontal_16(Sum5W_16D(row)); + sum_sq = Sum5Horizontal_32x2(Sum5W_32x3(row_sq)); } const uint16x4_t z0 = CalculateSgrMA2<n>(sum_sq.val[0], vget_low_u16(sum), s); const uint16x4_t z1 = CalculateSgrMA2<n>(sum_sq.val[1], vget_high_u16(sum), s); const uint16x8_t z01 = vcombine_u16(z0, z1); // Using vqmovn_u16() needs an extra sign extension instruction. - const uint16x8_t z = vminq_u16(v_255, z01); + const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255)); // Using vgetq_lane_s16() can save the sign extension instruction. 
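// [Editorial sketch, not part of the upstream diff] A scalar reference for the
// per-lane math that BoxFilterPreProcess4/BoxFilterPreProcess8 vectorize,
// assembled from the formulas quoted in the surrounding comments. The helper
// name is hypothetical; the constants are the ones from the removed block
// above (kSgrProjScaleBits = 20, kSgrProjSgrBits = 8,
// kSgrProjReciprocalBits = 12), and |p| is the variance-like term computed
// from the box sums before CalculateSgrMA2.
#include <algorithm>
#include <cstdint>
inline uint16_t SgrIntermediateScalar(uint32_t p, uint32_t s, uint32_t sum,
                                      uint32_t n) {
  // z = RightShiftWithRounding(p * s, kSgrProjScaleBits), capped at 255.
  const uint32_t z = std::min<uint32_t>((p * s + (1u << 19)) >> 20, 255);
  // kSgrMa2Lookup[z] holds 256 - a2 (clamped to 255 for z == 0), where
  // a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1).
  const uint32_t a2 = ((z << 8) + (z >> 1)) / (z + 1);
  const uint32_t sgr_ma2 = std::min<uint32_t>(256 - a2, 255);
  // b2 = (256 - a2) * sum * one_over_n, followed by a rounding shift by
  // kSgrProjReciprocalBits, as in CalculateIntermediate4/8.
  const uint32_t one_over_n = ((1u << 12) + (n >> 1)) / n;
  const uint32_t b2 = sgr_ma2 * sum * one_over_n;
  return static_cast<uint16_t>((b2 + (1u << 11)) >> 12);
}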
const uint8_t lookup[8] = { kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)], @@ -1148,40 +764,40 @@ inline void PreProcess8(const uint8x16_t* const row, kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)], kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]}; *sgr_ma2 = vld1_u8(lookup); - *b2 = CalculateB2Shifted(*sgr_ma2, sum, one_over_n); + *b2 = CalculateIntermediate8(*sgr_ma2, sum, one_over_n); const uint16x8_t sgr_ma2_b2 = vcombine_u16(vreinterpret_u16_u8(*sgr_ma2), vget_high_u16(*b2)); vst1q_u16(dst, sgr_ma2_b2); } -inline void Prepare3(const uint8x8_t a[2], uint8x8_t* const left, - uint8x8_t* const middle, uint8x8_t* const right) { +inline void Prepare3_8(const uint8x8_t a[2], uint8x8_t* const left, + uint8x8_t* const middle, uint8x8_t* const right) { *left = vext_u8(a[0], a[1], 4); *middle = vext_u8(a[0], a[1], 5); *right = vext_u8(a[0], a[1], 6); } -inline void Prepare3(const uint16x8_t a[2], uint16x8_t* const left, - uint16x8_t* const middle, uint16x8_t* const right) { - *left = vcombine_u16(vget_high_u16(a[0]), vget_low_u16(a[1])); +inline void Prepare3_16(const uint16x8_t a[2], uint16x8_t* const left, + uint16x8_t* const middle, uint16x8_t* const right) { + *left = vextq_u16(a[0], a[1], 4); *middle = vextq_u16(a[0], a[1], 5); *right = vextq_u16(a[0], a[1], 6); } inline uint16x8_t Sum343(const uint8x8_t a[2]) { uint8x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); - const uint16x8_t sum = Sum3W(left, middle, right); - const uint16x8_t sum3 = Sum3(sum, sum, sum); + Prepare3_8(a, &left, &middle, &right); + const uint16x8_t sum = Sum3W_16(left, middle, right); + const uint16x8_t sum3 = Sum3_16(sum, sum, sum); return vaddw_u8(sum3, middle); } inline void Sum343_444(const uint8x8_t a[2], uint16x8_t* const sum343, uint16x8_t* const sum444) { uint8x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); - const uint16x8_t sum = Sum3W(left, middle, right); - const uint16x8_t sum3 = Sum3(sum, sum, sum); + Prepare3_8(a, &left, &middle, &right); + const uint16x8_t sum = Sum3W_16(left, middle, right); + const uint16x8_t sum3 = Sum3_16(sum, sum, sum); *sum343 = vaddw_u8(sum3, middle); *sum444 = vshlq_n_u16(sum, 2); } @@ -1189,13 +805,13 @@ inline void Sum343_444(const uint8x8_t a[2], uint16x8_t* const sum343, inline uint32x4x2_t Sum343W(const uint16x8_t a[2]) { uint16x8_t left, middle, right; uint32x4x2_t d; - Prepare3(a, &left, &middle, &right); + Prepare3_16(a, &left, &middle, &right); d.val[0] = - Sum3W(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); - d.val[1] = - Sum3W(vget_high_u16(left), vget_high_u16(middle), vget_high_u16(right)); - d.val[0] = Sum3(d.val[0], d.val[0], d.val[0]); - d.val[1] = Sum3(d.val[1], d.val[1], d.val[1]); + Sum3W_32(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); + d.val[1] = Sum3W_32(vget_high_u16(left), vget_high_u16(middle), + vget_high_u16(right)); + d.val[0] = Sum3_32(d.val[0], d.val[0], d.val[0]); + d.val[1] = Sum3_32(d.val[1], d.val[1], d.val[1]); d.val[0] = vaddw_u16(d.val[0], vget_low_u16(middle)); d.val[1] = vaddw_u16(d.val[1], vget_high_u16(middle)); return d; @@ -1204,13 +820,13 @@ inline uint32x4x2_t Sum343W(const uint16x8_t a[2]) { inline void Sum343_444W(const uint16x8_t a[2], uint32x4x2_t* const sum343, uint32x4x2_t* const sum444) { uint16x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); + Prepare3_16(a, &left, &middle, &right); sum444->val[0] = - Sum3W(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); - sum444->val[1] = - 
Sum3W(vget_high_u16(left), vget_high_u16(middle), vget_high_u16(right)); - sum343->val[0] = Sum3(sum444->val[0], sum444->val[0], sum444->val[0]); - sum343->val[1] = Sum3(sum444->val[1], sum444->val[1], sum444->val[1]); + Sum3W_32(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right)); + sum444->val[1] = Sum3W_32(vget_high_u16(left), vget_high_u16(middle), + vget_high_u16(right)); + sum343->val[0] = Sum3_32(sum444->val[0], sum444->val[0], sum444->val[0]); + sum343->val[1] = Sum3_32(sum444->val[1], sum444->val[1], sum444->val[1]); sum343->val[0] = vaddw_u16(sum343->val[0], vget_low_u16(middle)); sum343->val[1] = vaddw_u16(sum343->val[1], vget_high_u16(middle)); sum444->val[0] = vshlq_n_u32(sum444->val[0], 2); @@ -1219,8 +835,8 @@ inline void Sum343_444W(const uint16x8_t a[2], uint32x4x2_t* const sum343, inline uint16x8_t Sum565(const uint8x8_t a[2]) { uint8x8_t left, middle, right; - Prepare3(a, &left, &middle, &right); - const uint16x8_t sum = Sum3W(left, middle, right); + Prepare3_8(a, &left, &middle, &right); + const uint16x8_t sum = Sum3W_16(left, middle, right); const uint16x8_t sum4 = vshlq_n_u16(sum, 2); const uint16x8_t sum5 = vaddq_u16(sum4, sum); return vaddw_u8(sum5, middle); @@ -1228,9 +844,9 @@ inline uint16x8_t Sum565(const uint8x8_t a[2]) { inline uint32x4_t Sum565W(const uint16x8_t a) { const uint16x4_t left = vget_low_u16(a); - const uint16x4_t middle = vext_u16(left, vget_high_u16(a), 1); - const uint16x4_t right = vext_u16(left, vget_high_u16(a), 2); - const uint32x4_t sum = Sum3W(left, middle, right); + const uint16x4_t middle = VshrU128<2>(a); + const uint16x4_t right = VshrU128<4>(a); + const uint32x4_t sum = Sum3W_32(left, middle, right); const uint32x4_t sum4 = vshlq_n_u32(sum, 2); const uint32x4_t sum5 = vaddq_u32(sum4, sum); return vaddw_u16(sum5, middle); @@ -1256,53 +872,95 @@ inline uint16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t a, } template <int shift> -inline void CalculateFilteredOutput(const uint8x8_t src, const uint16x8_t a, - const uint32x4x2_t b, uint16_t* const dst) { +inline int16x8_t CalculateFilteredOutput(const uint8x8_t src, + const uint16x8_t a, + const uint32x4x2_t b) { const uint16x8_t src_u16 = vmovl_u8(src); const uint16x4_t dst_lo = FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(a), b.val[0]); const uint16x4_t dst_hi = FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(a), b.val[1]); - const uint16x8_t d = vcombine_u16(dst_lo, dst_hi); - vst1q_u16(dst, d); + return vreinterpretq_s16_u16(vcombine_u16(dst_lo, dst_hi)); // 14 bits } -inline void BoxFilter1(const uint8x8_t src_u8, const uint8x8_t a2[2], - const uint16x8_t b2[2], uint16x8_t sum565_a[2], - uint32x4x2_t sum565_b[2], uint16_t* const out_buf) { +inline int16x8_t BoxFilterPass1(const uint8x8_t src_u8, const uint8x8_t a2[2], + const uint16x8_t b2[2], uint16x8_t sum565_a[2], + uint32x4x2_t sum565_b[2]) { uint32x4x2_t b_v; sum565_a[1] = Sum565(a2); sum565_a[1] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[1]); - sum565_b[1].val[0] = - Sum565W(vcombine_u16(vget_high_u16(b2[0]), vget_low_u16(b2[1]))); + sum565_b[1].val[0] = Sum565W(vextq_u16(b2[0], b2[1], 4)); sum565_b[1].val[1] = Sum565W(b2[1]); uint16x8_t a_v = vaddq_u16(sum565_a[0], sum565_a[1]); b_v.val[0] = vaddq_u32(sum565_b[0].val[0], sum565_b[1].val[0]); b_v.val[1] = vaddq_u32(sum565_b[0].val[1], sum565_b[1].val[1]); - CalculateFilteredOutput<5>(src_u8, a_v, b_v, out_buf); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits } -inline void BoxFilter2(const uint8x8_t src_u8, 
const uint8x8_t a2[2], - const uint16x8_t b2[2], uint16x8_t sum343_a[4], - uint16x8_t sum444_a[3], uint32x4x2_t sum343_b[4], - uint32x4x2_t sum444_b[3], uint16_t* const out_buf) { +inline int16x8_t BoxFilterPass2(const uint8x8_t src_u8, const uint8x8_t a2[2], + const uint16x8_t b2[2], uint16x8_t sum343_a[4], + uint16x8_t sum444_a[3], + uint32x4x2_t sum343_b[4], + uint32x4x2_t sum444_b[3]) { uint32x4x2_t b_v; Sum343_444(a2, &sum343_a[2], &sum444_a[1]); sum343_a[2] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[2]); sum444_a[1] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[1]); - uint16x8_t a_v = Sum3(sum343_a[0], sum444_a[0], sum343_a[2]); + uint16x8_t a_v = Sum3_16(sum343_a[0], sum444_a[0], sum343_a[2]); Sum343_444W(b2, &sum343_b[2], &sum444_b[1]); - b_v.val[0] = Sum3(sum343_b[0].val[0], sum444_b[0].val[0], sum343_b[2].val[0]); - b_v.val[1] = Sum3(sum343_b[0].val[1], sum444_b[0].val[1], sum343_b[2].val[1]); - CalculateFilteredOutput<5>(src_u8, a_v, b_v, out_buf); + b_v.val[0] = + Sum3_32(sum343_b[0].val[0], sum444_b[0].val[0], sum343_b[2].val[0]); + b_v.val[1] = + Sum3_32(sum343_b[0].val[1], sum444_b[0].val[1], sum343_b[2].val[1]); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits +} + +inline void SelfGuidedDoubleMultiplier( + const uint8x8_t src, const int16x8_t box_filter_process_output[2], + const int16x4_t w0, const int16x4_t w1, const int16x4_t w2, + uint8_t* const dst) { + // |wN| values are signed. |src| values can be treated as int16_t. + const int16x8_t u = + vreinterpretq_s16_u16(vshll_n_u8(src, kSgrProjRestoreBits)); + int32x4_t v_lo = vmull_s16(vget_low_s16(u), w1); + v_lo = vmlal_s16(v_lo, vget_low_s16(box_filter_process_output[0]), w0); + v_lo = vmlal_s16(v_lo, vget_low_s16(box_filter_process_output[1]), w2); + int32x4_t v_hi = vmull_s16(vget_high_s16(u), w1); + v_hi = vmlal_s16(v_hi, vget_high_s16(box_filter_process_output[0]), w0); + v_hi = vmlal_s16(v_hi, vget_high_s16(box_filter_process_output[1]), w2); + // |s| is saturated to uint8_t. 
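// [Editorial sketch, not part of the upstream diff] The scalar equivalent of
// one pixel of SelfGuidedDoubleMultiplier. The function name is hypothetical;
// kSgrProjRestoreBits is 4 (from the removed constants earlier in this file),
// and kSgrProjPrecisionBits is assumed to be 7 here since its definition is
// outside this diff.
#include <algorithm>
#include <cstdint>
inline uint8_t SelfGuidedPixelSketch(uint8_t src, int16_t p0, int16_t p1,
                                     int16_t w0, int16_t w1, int16_t w2) {
  const int32_t u = src << 4;  // src << kSgrProjRestoreBits
  // |p0| and |p1| are the 14-bit outputs of box filter pass 1 and pass 2.
  const int32_t v = u * w1 + p0 * w0 + p1 * w2;
  // Rounding shift by kSgrProjRestoreBits + kSgrProjPrecisionBits (4 + 7).
  const int32_t s = (v + (1 << 10)) >> 11;
  return static_cast<uint8_t>(std::min(std::max(s, 0), 255));  // saturate
}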
+ const int16x4_t s_lo = + vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const int16x4_t s_hi = + vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + vst1_u8(dst, vqmovun_s16(vcombine_s16(s_lo, s_hi))); } -inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, +inline void SelfGuidedSingleMultiplier( + const uint8x8_t src, const int16x8_t box_filter_process_output, + const int16_t w0, const int16_t w1, uint8_t* dst) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + const int16x8_t u = + vreinterpretq_s16_u16(vshll_n_u8(src, kSgrProjRestoreBits)); + // u * w1 + u * wN == u * (w1 + wN) + int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w1); + v_lo = vmlal_n_s16(v_lo, vget_low_s16(box_filter_process_output), w0); + int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w1); + v_hi = vmlal_n_s16(v_hi, vget_high_s16(box_filter_process_output), w0); + const int16x4_t s_lo = + vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const int16x4_t s_hi = + vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + vst1_u8(dst, vqmovun_s16(vcombine_s16(s_lo, s_hi))); +} + +inline void BoxFilterProcess(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, const int width, const int height, - const uint16_t s[2], - uint16_t* const box_filter_process_output, - uint16_t* const temp) { + const uint16_t s[2], uint16_t* const temp, + uint8_t* const dst, const ptrdiff_t dst_stride) { // We have combined PreProcess and Process for the first pass by storing // intermediate values in the |a2| region. The values stored are one vertical // column of interleaved |a2| and |b2| values and consume 8 * |height| values. @@ -1340,45 +998,39 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, // interleaved in |temp|. The first half is not stored, since it is used // immediately and becomes useless for the next column. Next we will start the // second column. When 2 rows have been calculated we can calculate Process - // and output those into the top of |box_filter_process_output|. + // and output the results. // Calculate and store a single column. Scope so we can re-use the variable // names for the next step. uint16_t* ab_ptr = temp; - // The first phase needs a radius of 2 context values. The second phase - // needs a context of radius 1 values. This means we start at (-3, -3). - const uint8_t* const src_pre_process = src - 3 - 3 * stride; - // Calculate intermediate results, including two-pixel border, for example, - // if unit size is 64x64, we calculate 68x68 pixels. + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. 
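// [Editorial note, not part of the upstream diff] The 68x68 figure above is
// the unit plus the pass-1 context: a radius-2 box needs two extra pixels on
// each side, so 64 + 2 + 2 = 68. The second pass uses a radius-1 box, which
// is why the pass-2 code later in this file quotes 66x66 (64 + 1 + 1).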
{ const uint8_t* column = src_pre_process; uint8x8_t row[5]; uint16x8_t row_sq[5]; - - row[0] = vld1_u8(column); - column += stride; - row[1] = vld1_u8(column); - column += stride; + row[0] = row[1] = vld1_u8(column); + column += src_stride; row[2] = vld1_u8(column); - row_sq[0] = vmull_u8(row[0], row[0]); - row_sq[1] = vmull_u8(row[1], row[1]); + row_sq[0] = row_sq[1] = vmull_u8(row[1], row[1]); row_sq[2] = vmull_u8(row[2], row[2]); - int y = 0; + int y = (height + 2) >> 1; do { - column += stride; + column += src_stride; row[3] = vld1_u8(column); - column += stride; + column += src_stride; row[4] = vld1_u8(column); row_sq[3] = vmull_u8(row[3], row[3]); row_sq[4] = vmull_u8(row[4], row[4]); - PreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); - PreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); - PreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16); + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16); row[0] = row[2]; row[1] = row[3]; @@ -1388,10 +1040,23 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; ab_ptr += 24; - y += 2; - } while (y < height + 2); + } while (--y != 0); + + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = vld1_u8(column); + row_sq[3] = row_sq[4] = vmull_u8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + } } + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + const int16x4_t w0_v = vdup_n_s16(w0); + const int16x4_t w1_v = vdup_n_s16(w1); + const int16x4_t w2_v = vdup_n_s16(w2); int x = 0; do { // |src_pre_process| is X but we already processed the first column of 4 @@ -1423,21 +1088,18 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, const uint8_t* column = src_pre_process + x + 4; uint8x16_t row[5]; uint16x8x2_t row_sq[5]; - - row[0] = vld1q_u8(column); - column += stride; - row[1] = vld1q_u8(column); - column += stride; + row[0] = row[1] = vld1q_u8(column); + column += src_stride; row[2] = vld1q_u8(column); - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); - row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0])); - row_sq[0].val[1] = vmull_u8(vget_high_u8(row[0]), vget_high_u8(row[0])); - row_sq[1].val[0] = vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); - row_sq[1].val[1] = vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); + row_sq[0].val[0] = row_sq[1].val[0] = + vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); + row_sq[0].val[1] = row_sq[1].val[1] = + vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1445,21 +1107,17 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s[0], 
&a2[0][1], &b2[0][1], ab_ptr); - PreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 8); + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1], + ab_ptr + 8); // Pass 1 Process. These are the only values we need to propagate between // rows. sum565_a[0] = Sum565(a2[0]); sum565_a[0] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[0]); - sum565_b[0].val[0] = - Sum565W(vcombine_u16(vget_high_u16(b2[0][0]), vget_low_u16(b2[0][1]))); + sum565_b[0].val[0] = Sum565W(vextq_u16(b2[0][0], b2[0][1], 4)); sum565_b[0].val[1] = Sum565W(b2[0][1]); - const uint8_t* src_ptr = src + x; - uint16_t* out_buf = box_filter_process_output + 2 * x; - sum343_a[0] = Sum343(a2[1]); sum343_a[0] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[0]); sum343_b[0] = Sum343W(b2[1]); @@ -1467,19 +1125,21 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, b2[1][0] = vld1q_u16(ab_ptr + 16); a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0])); - PreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 16); + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1], + ab_ptr + 16); Sum343_444(a2[1], &sum343_a[1], &sum444_a[0]); sum343_a[1] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[1]); sum444_a[0] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[0]); Sum343_444W(b2[1], &sum343_b[1], &sum444_b[0]); + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + // Calculate one output line. Add in the line from the previous pass and // output one even row. Sum the new line and output the odd row. Carry the // new row into the next pass. - int y = 0; - do { + for (int y = height >> 1; y != 0; --y) { ab_ptr += 24; b2[0][0] = vld1q_u16(ab_ptr); a2[0][0] = vget_low_u8(vreinterpretq_u8_u16(b2[0][0])); @@ -1494,9 +1154,9 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1504,28 +1164,31 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], ab_ptr); - PreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 8); + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], + ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], + &b2[1][1], ab_ptr + 8); + + int16x8_t p[2]; + const uint8x8_t src0 = vld1_u8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; - uint8x8_t src_u8 = vld1_u8(src_ptr); - BoxFilter1(src_u8, a2[0], b2[0], sum565_a, sum565_b, out_buf); - BoxFilter2(src_u8, a2[1], b2[1], sum343_a, sum444_a, sum343_b, sum444_b, - out_buf + 8); - src_ptr += stride; - out_buf += 2 * kRestorationProcessingUnitSize; - - src_u8 = vld1_u8(src_ptr); - CalculateFilteredOutput<4>(src_u8, sum565_a[1], sum565_b[1], out_buf); + 
const uint8x8_t src1 = vld1_u8(src_ptr); + p[0] = CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); b2[1][0] = vld1q_u16(ab_ptr + 16); a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0])); - PreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1], - ab_ptr + 16); - - BoxFilter2(src_u8, a2[1], b2[1], sum343_a + 1, sum444_a + 1, sum343_b + 1, - sum444_b + 1, out_buf + 8); - src_ptr += stride; - out_buf += 2 * kRestorationProcessingUnitSize; + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], + &b2[1][1], ab_ptr + 16); + p[1] = BoxFilterPass2(src1, a2[1], b2[1], sum343_a + 1, sum444_a + 1, + sum343_b + 1, sum444_b + 1); + SelfGuidedDoubleMultiplier(src1, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; sum565_a[0] = sum565_a[1]; sum565_b[0] = sum565_b[1]; @@ -1535,17 +1198,53 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride, sum343_b[0] = sum343_b[2]; sum343_b[1] = sum343_b[3]; sum444_b[0] = sum444_b[2]; + } + if ((height & 1) != 0) { + ab_ptr += 24; + b2[0][0] = vld1q_u16(ab_ptr); + a2[0][0] = vget_low_u8(vreinterpretq_u8_u16(b2[0][0])); + b2[1][0] = vld1q_u16(ab_ptr + 8); + a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0])); - y += 2; - } while (y < height); + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + + column += src_stride; + row[3] = row[4] = vld1q_u8(column); + + row_sq[3].val[0] = row_sq[4].val[0] = + vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); + row_sq[3].val[1] = row_sq[4].val[1] = + vmull_u8(vget_high_u8(row[3]), vget_high_u8(row[3])); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], + ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], + &b2[1][1], ab_ptr + 8); + + int16x8_t p[2]; + const uint8x8_t src0 = vld1_u8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + } x += 8; } while (x < width); } -inline void BoxFilterProcess_FirstPass( - const uint8_t* const src, const ptrdiff_t stride, const int width, - const int height, const uint32_t s, - uint16_t* const box_filter_process_output, uint16_t* const temp) { +inline void BoxFilterProcessPass1(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { // We have combined PreProcess and Process for the first pass by storing // intermediate values in the |a2| region. The values stored are one vertical // column of interleaved |a2| and |b2| values and consume 8 * |height| values. @@ -1583,43 +1282,37 @@ inline void BoxFilterProcess_FirstPass( // interleaved in |temp|. The first half is not stored, since it is used // immediately and becomes useless for the next column. Next we will start the // second column. When 2 rows have been calculated we can calculate Process - // and output those into the top of |box_filter_process_output|. + // and output the results. // Calculate and store a single column. Scope so we can re-use the variable // names for the next step. uint16_t* ab_ptr = temp; - // The first phase needs a radius of 2 context values. The second phase - // needs a context of radius 1 values. This means we start at (-3, -3). 
- const uint8_t* const src_pre_process = src - 3 - 3 * stride; - // Calculate intermediate results, including two-pixel border, for example, - // if unit size is 64x64, we calculate 68x68 pixels. + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. { const uint8_t* column = src_pre_process; uint8x8_t row[5]; uint16x8_t row_sq[5]; - - row[0] = vld1_u8(column); - column += stride; - row[1] = vld1_u8(column); - column += stride; + row[0] = row[1] = vld1_u8(column); + column += src_stride; row[2] = vld1_u8(column); - row_sq[0] = vmull_u8(row[0], row[0]); - row_sq[1] = vmull_u8(row[1], row[1]); + row_sq[0] = row_sq[1] = vmull_u8(row[1], row[1]); row_sq[2] = vmull_u8(row[2], row[2]); - int y = 0; + int y = (height + 2) >> 1; do { - column += stride; + column += src_stride; row[3] = vld1_u8(column); - column += stride; + column += src_stride; row[4] = vld1_u8(column); row_sq[3] = vmull_u8(row[3], row[3]); row_sq[4] = vmull_u8(row[4], row[4]); - PreProcess4<5, 0>(row, row_sq, s, ab_ptr); + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); row[0] = row[2]; row[1] = row[3]; @@ -1629,10 +1322,18 @@ inline void BoxFilterProcess_FirstPass( row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; ab_ptr += 8; - y += 2; - } while (y < height + 2); + } while (--y != 0); + + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = vld1_u8(column); + row_sq[3] = row_sq[4] = vmull_u8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); + } } + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0; int x = 0; do { // |src_pre_process| is X but we already processed the first column of 4 @@ -1662,21 +1363,18 @@ inline void BoxFilterProcess_FirstPass( const uint8_t* column = src_pre_process + x + 4; uint8x16_t row[5]; uint16x8x2_t row_sq[5]; - - row[0] = vld1q_u8(column); - column += stride; - row[1] = vld1q_u8(column); - column += stride; + row[0] = row[1] = vld1q_u8(column); + column += src_stride; row[2] = vld1q_u8(column); - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); - row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0])); - row_sq[0].val[1] = vmull_u8(vget_high_u8(row[0]), vget_high_u8(row[0])); - row_sq[1].val[0] = vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); - row_sq[1].val[1] = vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); + row_sq[0].val[0] = row_sq[1].val[0] = + vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1])); + row_sq[0].val[1] = row_sq[1].val[1] = + vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1])); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1684,24 +1382,22 @@ inline void BoxFilterProcess_FirstPass( row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); // Pass 1 Process. These are the only values we need to propagate between // rows. 
sum565_a[0] = Sum565(a2); sum565_a[0] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[0]); - sum565_b[0].val[0] = - Sum565W(vcombine_u16(vget_high_u16(b2[0]), vget_low_u16(b2[1]))); + sum565_b[0].val[0] = Sum565W(vextq_u16(b2[0], b2[1], 4)); sum565_b[0].val[1] = Sum565W(b2[1]); const uint8_t* src_ptr = src + x; - uint16_t* out_buf = box_filter_process_output + x; + uint8_t* dst_ptr = dst + x; // Calculate one output line. Add in the line from the previous pass and // output one even row. Sum the new line and output the odd row. Carry the // new row into the next pass. - int y = 0; - do { + for (int y = height >> 1; y != 0; --y) { ab_ptr += 8; b2[0] = vld1q_u16(ab_ptr); a2[0] = vget_low_u8(vreinterpretq_u8_u16(b2[0])); @@ -1714,9 +1410,9 @@ inline void BoxFilterProcess_FirstPass( row_sq[1] = row_sq[3]; row_sq[2] = row_sq[4]; - column += stride; + column += src_stride; row[3] = vld1q_u8(column); - column += stride; + column += src_stride; row[4] = vld1q_u8(column); row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); @@ -1724,55 +1420,86 @@ inline void BoxFilterProcess_FirstPass( row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4])); row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4])); - PreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); - uint8x8_t src_u8 = vld1_u8(src_ptr); - BoxFilter1(src_u8, a2, b2, sum565_a, sum565_b, out_buf); - src_ptr += stride; - out_buf += kRestorationProcessingUnitSize; + const uint8x8_t src0 = vld1_u8(src_ptr); + const int16x8_t p0 = BoxFilterPass1(src0, a2, b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; - src_u8 = vld1_u8(src_ptr); - CalculateFilteredOutput<4>(src_u8, sum565_a[1], sum565_b[1], out_buf); - src_ptr += stride; - out_buf += kRestorationProcessingUnitSize; + const uint8x8_t src1 = vld1_u8(src_ptr); + const int16x8_t p1 = + CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); + SelfGuidedSingleMultiplier(src1, p1, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; sum565_a[0] = sum565_a[1]; sum565_b[0] = sum565_b[1]; - y += 2; - } while (y < height); + } + if ((height & 1) != 0) { + ab_ptr += 8; + b2[0] = vld1q_u16(ab_ptr); + a2[0] = vget_low_u8(vreinterpretq_u8_u16(b2[0])); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + + column += src_stride; + row[3] = row[4] = vld1q_u8(column); + + row_sq[3].val[0] = row_sq[4].val[0] = + vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3])); + row_sq[3].val[1] = row_sq[4].val[1] = + vmull_u8(vget_high_u8(row[3]), vget_high_u8(row[3])); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + + const uint8x8_t src0 = vld1_u8(src_ptr); + const int16x8_t p0 = BoxFilterPass1(src0, a2, b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); + } x += 8; } while (x < width); } -inline void BoxFilterProcess_SecondPass( - const uint8_t* src, const ptrdiff_t stride, const int width, - const int height, const uint32_t s, - uint16_t* const box_filter_process_output, uint16_t* const temp) { +inline void BoxFilterProcessPass2(const uint8_t* src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { 
uint16_t* ab_ptr = temp; - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. + // Calculate intermediate results, including one-pixel border, for example, if + // unit size is 64x64, we calculate 66x66 pixels. // Because of the vectors this calculates start in blocks of 4 so we actually // get 68 values. - const uint8_t* const src_top_left_corner = src - 2 - 2 * stride; + const uint8_t* const src_top_left_corner = src - 2 - 2 * src_stride; { const uint8_t* column = src_top_left_corner; uint8x8_t row[3]; uint16x8_t row_sq[3]; - row[0] = vld1_u8(column); - column += stride; + column += src_stride; row[1] = vld1_u8(column); row_sq[0] = vmull_u8(row[0], row[0]); row_sq[1] = vmull_u8(row[1], row[1]); int y = height + 2; do { - column += stride; + column += src_stride; row[2] = vld1_u8(column); row_sq[2] = vmull_u8(row[2], row[2]); - PreProcess4<3, 0>(row, row_sq, s, ab_ptr); + BoxFilterPreProcess4<3, 0>(row, row_sq, s, ab_ptr); row[0] = row[1]; row[1] = row[2]; @@ -1780,13 +1507,14 @@ inline void BoxFilterProcess_SecondPass( row_sq[0] = row_sq[1]; row_sq[1] = row_sq[2]; ab_ptr += 8; - } while (--y); + } while (--y != 0); } + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; int x = 0; do { - const uint8_t* src_ptr = src + x; - uint16_t* out_buf = box_filter_process_output + x; ab_ptr = temp; uint8x8_t a2[2]; @@ -1799,9 +1527,9 @@ inline void BoxFilterProcess_SecondPass( uint8x16_t row[3]; uint16x8x2_t row_sq[3]; row[0] = vld1q_u8(column); - column += stride; + column += src_stride; row[1] = vld1q_u8(column); - column += stride; + column += src_stride; row[2] = vld1q_u8(column); row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0])); @@ -1811,7 +1539,7 @@ inline void BoxFilterProcess_SecondPass( row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); - PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); sum343_a[0] = Sum343(a2); sum343_a[0] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[0]); @@ -1826,19 +1554,21 @@ inline void BoxFilterProcess_SecondPass( row_sq[0] = row_sq[1]; row_sq[1] = row_sq[2]; - column += stride; + column += src_stride; row[2] = vld1q_u8(column); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); - PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); Sum343_444(a2, &sum343_a[1], &sum444_a[0]); sum343_a[1] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[1]); sum444_a[0] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[0]); Sum343_444W(b2, &sum343_b[1], &sum444_b[0]); + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; int y = height; do { ab_ptr += 8; @@ -1850,214 +1580,59 @@ inline void BoxFilterProcess_SecondPass( row_sq[0] = row_sq[1]; row_sq[1] = row_sq[2]; - column += stride; + column += src_stride; row[2] = vld1q_u8(column); row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2])); row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2])); - PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr); + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], 
ab_ptr); uint8x8_t src_u8 = vld1_u8(src_ptr); - BoxFilter2(src_u8, a2, b2, sum343_a, sum444_a, sum343_b, sum444_b, - out_buf); + int16x8_t p = BoxFilterPass2(src_u8, a2, b2, sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedSingleMultiplier(src_u8, p, w0, w1, dst_ptr); sum343_a[0] = sum343_a[1]; sum343_a[1] = sum343_a[2]; sum444_a[0] = sum444_a[1]; sum343_b[0] = sum343_b[1]; sum343_b[1] = sum343_b[2]; sum444_b[0] = sum444_b[1]; - src_ptr += stride; - out_buf += kRestorationProcessingUnitSize; - } while (--y); - x += 8; - } while (x < width); -} - -inline void SelfGuidedSingleMultiplier( - const uint8_t* src, const ptrdiff_t src_stride, - const uint16_t* const box_filter_process_output, uint8_t* dst, - const ptrdiff_t dst_stride, const int width, const int height, - const int16_t w_single) { - const int16_t w_combo = (1 << kSgrProjPrecisionBits) - w_single; - const auto* box_filter = - reinterpret_cast<const int16_t*>(box_filter_process_output); - int w = width; - - if (w & 4) { - w -= 4; - const uint8_t* src_ptr = src + w; - uint8_t* dst_ptr = dst + w; - const int16_t* box_filter_w = box_filter + w; - int y = height; - do { - const int16x8_t u = vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src_ptr), kSgrProjRestoreBits)); - const int16x4_t p = vld1_s16(box_filter_w); - // u * w1 + u * wN == u * (w1 + wN) - int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo); - v_lo = vmlal_n_s16(v_lo, p, w_single); - const int16x4_t s_lo = - vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); - StoreLo4(dst_ptr, vqmovun_s16(vcombine_s16(s_lo, s_lo))); - src_ptr += src_stride; - dst_ptr += dst_stride; - box_filter_w += kRestorationProcessingUnitSize; - } while (--y); - - if (!w) return; - } - - int y = height; - do { - int x = 0; - do { - const int16x8_t u = vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits)); - const int16x8_t p = vld1q_s16(box_filter + x); - // u * w1 + u * wN == u * (w1 + wN) - int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo); - v_lo = vmlal_n_s16(v_lo, vget_low_s16(p), w_single); - int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w_combo); - v_hi = vmlal_n_s16(v_hi, vget_high_s16(p), w_single); - const int16x4_t s_lo = - vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); - const int16x4_t s_hi = - vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); - vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi))); - x += 8; - } while (x < w); - src += src_stride; - dst += dst_stride; - box_filter += kRestorationProcessingUnitSize; - } while (--y); -} - -inline void SelfGuidedDoubleMultiplier( - const uint8_t* src, const ptrdiff_t src_stride, - const uint16_t* const box_filter_process_output, uint8_t* dst, - const ptrdiff_t dst_stride, const int width, const int height, const int w0, - const int w1, const int w2) { - const auto* box_filter = - reinterpret_cast<const int16_t*>(box_filter_process_output); - const int16x4_t w0_v = vdup_n_s16(w0); - const int16x4_t w1_v = vdup_n_s16(w1); - const int16x4_t w2_v = vdup_n_s16(w2); - int w = width; - - if (w & 4) { - w -= 4; - const uint8_t* src_ptr = src + w; - uint8_t* dst_ptr = dst + w; - const int16_t* box_filter_w = box_filter + 2 * w; - int y = height; - do { - // |wN| values are signed. |src| values can be treated as int16_t. - // Load 8 values but ignore 4. - const int16x4_t u = vget_low_s16(vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src_ptr), kSgrProjRestoreBits))); - // |box_filter_process_output| is 14 bits, also safe to treat as int16_t. 
- const int16x4_t p0 = vld1_s16(box_filter_w + 0); - const int16x4_t p1 = vld1_s16(box_filter_w + 8); - int32x4_t v = vmull_s16(u, w1_v); - v = vmlal_s16(v, p0, w0_v); - v = vmlal_s16(v, p1, w2_v); - // |s| is saturated to uint8_t. - const int16x4_t s = - vrshrn_n_s32(v, kSgrProjRestoreBits + kSgrProjPrecisionBits); - StoreLo4(dst_ptr, vqmovun_s16(vcombine_s16(s, s))); src_ptr += src_stride; dst_ptr += dst_stride; - box_filter_w += 2 * kRestorationProcessingUnitSize; - } while (--y); - - if (!w) return; - } - - int y = height; - do { - int x = 0; - do { - // |wN| values are signed. |src| values can be treated as int16_t. - const int16x8_t u = vreinterpretq_s16_u16( - vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits)); - // |box_filter_process_output| is 14 bits, also safe to treat as int16_t. - const int16x8_t p0 = vld1q_s16(box_filter + 2 * x + 0); - const int16x8_t p1 = vld1q_s16(box_filter + 2 * x + 8); - int32x4_t v_lo = vmull_s16(vget_low_s16(u), w1_v); - v_lo = vmlal_s16(v_lo, vget_low_s16(p0), w0_v); - v_lo = vmlal_s16(v_lo, vget_low_s16(p1), w2_v); - int32x4_t v_hi = vmull_s16(vget_high_s16(u), w1_v); - v_hi = vmlal_s16(v_hi, vget_high_s16(p0), w0_v); - v_hi = vmlal_s16(v_hi, vget_high_s16(p1), w2_v); - // |s| is saturated to uint8_t. - const int16x4_t s_lo = - vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); - const int16x4_t s_hi = - vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); - vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi))); - x += 8; - } while (x < w); - src += src_stride; - dst += dst_stride; - box_filter += 2 * kRestorationProcessingUnitSize; - } while (--y); + } while (--y != 0); + x += 8; + } while (x < width); } +// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in +// the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. void SelfGuidedFilter_NEON(const void* const source, void* const dest, const RestorationUnitInfo& restoration_info, - ptrdiff_t source_stride, ptrdiff_t dest_stride, - const int width, const int height, - RestorationBuffer* const /*buffer*/) { - const auto* src = static_cast<const uint8_t*>(source); - - // The output frame is broken into blocks of 64x64 (32x32 if U/V are - // subsampled). If either dimension is less than 32/64 it indicates it is at - // the right or bottom edge of the frame. It is safe to overwrite the output - // as it will not be part of the visible frame. This saves us from having to - // handle non-multiple-of-8 widths. - // We could round here, but the for loop with += 8 does the same thing. - - // width = (width + 7) & ~0x7; - - // -96 to 96 (Sgrproj_Xqd_Min/Max) + const ptrdiff_t source_stride, + const ptrdiff_t dest_stride, const int width, + const int height, RestorationBuffer* const buffer) { const int index = restoration_info.sgr_proj_info.index; - const int radius_pass_0 = kSgrProjParams[index][0]; - const int radius_pass_1 = kSgrProjParams[index][2]; - alignas(kMaxAlignment) - uint16_t box_filter_process_output[2 * kMaxBoxFilterProcessOutputPixels]; - alignas(kMaxAlignment) - uint16_t temp[12 * (kRestorationProcessingUnitSize + 2)]; - - // If |radius| is 0 then there is nothing to do. If |radius| is not 0, it is - // always 2 for the first pass and 1 for the second pass. 
- const int w0 = restoration_info.sgr_proj_info.multiplier[0]; - const int w1 = restoration_info.sgr_proj_info.multiplier[1]; - const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* src = static_cast<const uint8_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - // Note: Combining box filter process with the final multipliers has no speed - // gain. There are not enough neon registers to hold those weights. - if (radius_pass_0 != 0 && radius_pass_1 != 0) { - BoxFilterProcess(src, source_stride, width, height, - kSgrScaleParameter[index], box_filter_process_output, - temp); - SelfGuidedDoubleMultiplier(src, source_stride, box_filter_process_output, - dst, dest_stride, width, height, w0, w1, w2); + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. + assert(radius_pass_0 != 0); + BoxFilterProcessPass1(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][0], buffer->sgf_buffer, dst, + dest_stride); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][1], buffer->sgf_buffer, dst, + dest_stride); } else { - int16_t w_single; - if (radius_pass_0 != 0) { - BoxFilterProcess_FirstPass(src, source_stride, width, height, - kSgrScaleParameter[index][0], - box_filter_process_output, temp); - w_single = w0; - } else /* if (radius_pass_1 != 0) */ { - BoxFilterProcess_SecondPass(src, source_stride, width, height, - kSgrScaleParameter[index][1], - box_filter_process_output, temp); - w_single = w2; - } - SelfGuidedSingleMultiplier(src, source_stride, box_filter_process_output, - dst, dest_stride, width, height, w_single); + BoxFilterProcess(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index], buffer->sgf_buffer, dst, + dest_stride); } } diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc index b84548de6f7..3e731b22450 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc @@ -34,92 +34,77 @@ namespace libgav1 { namespace dsp { namespace { -inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) { - // Add 63 to negative delta so that it shifts towards zero. 
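// [Editorial note, not part of the upstream diff] The "+ 63" trick in
// Project_NEON is integer division by 64 that truncates towards zero:
//   delta >= 0:  delta >> 6        == trunc(delta / 64)
//   delta <  0: (delta + 63) >> 6  == trunc(delta / 64), e.g. (-1 + 63) >> 6 == 0
// vsraq_n_u16(delta_u, delta_sign_u, 10) adds exactly that 63 only to the
// negative lanes, because the all-ones sign mask shifted right by 10 as an
// unsigned 16-bit value is 63.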
- const int16x8_t delta_sign = vshrq_n_s16(delta, 15); - const uint16x8_t delta_u = vreinterpretq_u16_s16(delta); - const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign); - const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10); - const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u); - const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6); - const int16x8_t offset1 = veorq_s16(offset0, dst_sign); - const int16x8_t offset2 = vsubq_s16(offset1, dst_sign); - return vqmovn_s16(offset2); -} - -inline int16x8_t LookupTable(const int8x8x4_t division_table, - const int8x16_t idx) { - const int8x8_t idx_low = vget_low_s8(idx); - const int8x8_t idx_high = vget_high_s8(idx); - const int16x4_t d0 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_low)); - const int16x4_t d1 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_high)); - return vcombine_s16(d0, d1); -} - -inline int16x8_t LoadDivision(const int8x8x4_t division_table[2], +inline int16x8_t LoadDivision(const int8x8x2_t division_table, const int8x8_t reference_offset) { - const int8x16_t k32 = vdupq_n_s8(32); const int8x8_t kOne = vcreate_s8(0x0100010001000100); const int8x16_t kOneQ = vcombine_s8(kOne, kOne); const int8x8_t t = vadd_s8(reference_offset, reference_offset); const int8x8x2_t tt = vzip_s8(t, t); const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]); - const int8x16_t idx0 = vaddq_s8(t1, kOneQ); - const int8x16_t idx1 = vsubq_s8(idx0, k32); - const int16x8_t denorm0 = LookupTable(division_table[0], idx0); - const int16x8_t denorm1 = LookupTable(division_table[1], idx1); - return vorrq_s16(denorm0, denorm1); + const int8x16_t idx = vaddq_s8(t1, kOneQ); + const int8x8_t idx_low = vget_low_s8(idx); + const int8x8_t idx_high = vget_high_s8(idx); + const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low)); + const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high)); + return vcombine_s16(d0, d1); } inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator, const int numerator) { const int32x4_t m0 = vmull_s16(mv, denominator); const int32x4_t m = vmulq_n_s32(m0, numerator); - // Subtract the sign bit to round towards zero. - const int32x4_t sub_sign = vsraq_n_s32(m, m, 31); - return vqrshrn_n_s32(sub_sign, 14); + // Add the sign (0 or -1) to round towards zero. 
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31); + return vqrshrn_n_s32(add_sign, 14); } inline int16x8_t MvProjectionClip(const int16x8_t mv, const int16x8_t denominator, const int numerator) { - const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp); const int16x4_t mv0 = vget_low_s16(mv); const int16x4_t mv1 = vget_high_s16(mv); - const int16x4_t m0 = MvProjection(mv0, vget_low_s16(denominator), numerator); - const int16x4_t m1 = MvProjection(mv1, vget_high_s16(denominator), numerator); - const int16x8_t m = vcombine_s16(m0, m1); - const int16x8_t clamp = vminq_s16(m, projection_mv_clamp); + const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator); + const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator); + const int16x8_t projection = vcombine_s16(s0, s1); + const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp); + const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp); return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp)); } -inline void GetMvProjection(const int32x4_t mv[2], const int16x8_t denominator, - const int numerator, int16x8_t projection_mv[2]) { - const int16x8_t mv0 = vreinterpretq_s16_s32(mv[0]); - const int16x8_t mv1 = vreinterpretq_s16_s32(mv[1]); - // Deinterlace - const int16x8x2_t mvs = vuzpq_s16(mv0, mv1); - projection_mv[0] = MvProjectionClip(mvs.val[0], denominator, numerator); - projection_mv[1] = MvProjectionClip(mvs.val[1], denominator, numerator); +inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) { + // Add 63 to negative delta so that it shifts towards zero. + const int16x8_t delta_sign = vshrq_n_s16(delta, 15); + const uint16x8_t delta_u = vreinterpretq_u16_s16(delta); + const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign); + const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10); + const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u); + const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6); + const int16x8_t offset1 = veorq_s16(offset0, dst_sign); + const int16x8_t offset2 = vsubq_s16(offset1, dst_sign); + return vqmovn_s16(offset2); } -void GetPosition(const int8x8x4_t division_table[2], - const MotionVector* const mv, - const int reference_to_current_with_sign, const int x8_start, - const int x8_end, const int x8, const int8x8_t r_offsets, - const int8x8_t source_reference_type8, const int8x8_t skip_r, - const int8x8_t y8_floor8, const int8x8_t y8_ceiling8, - const int16x8_t d_sign, const int delta, int8x8_t* const r, - int8x8_t* const position_y8, int8x8_t* const position_x8, - int64_t* const skip_64, int32x4_t mvs[2]) { - const int32_t* const mv_int = reinterpret_cast<const int32_t*>(mv + x8); +inline void GetPosition( + const int8x8x2_t division_table, const MotionVector* const mv, + const int numerator, const int x8_start, const int x8_end, const int x8, + const int8x8_t r_offsets, const int8x8_t source_reference_type8, + const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8, + const int16x8_t d_sign, const int delta, int8x8_t* const r, + int8x8_t* const position_y8, int8x8_t* const position_x8, + int64_t* const skip_64, int32x4_t mvs[2]) { + const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8); *r = vtbl1_s8(r_offsets, source_reference_type8); - const int16x8_t denorm = LoadDivision(division_table, *r); + const int16x8_t denorm = LoadDivision(division_table, source_reference_type8); int16x8_t projection_mv[2]; mvs[0] = vld1q_s32(mv_int + 0); mvs[1] 
= vld1q_s32(mv_int + 4); - // reference_to_current_with_sign could be 0. - GetMvProjection(mvs, denorm, reference_to_current_with_sign, projection_mv); + // Deinterlace x and y components + const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]); + const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]); + const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1); + // numerator could be 0. + projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator); + projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator); // Do not update the motion vector if the block position is not valid or // if position_x8 is outside the current range of x8_start and x8_end. // Note that position_y8 will always be within the range of y8_start and @@ -147,46 +132,31 @@ void GetPosition(const int8x8x4_t division_table[2], } template <int idx> -int16_t VgetqLaneS16(const int16x8_t src) { - if (idx == 0) return vgetq_lane_s16(src, 0); - if (idx == 1) return vgetq_lane_s16(src, 1); - if (idx == 2) return vgetq_lane_s16(src, 2); - if (idx == 3) return vgetq_lane_s16(src, 3); - if (idx == 4) return vgetq_lane_s16(src, 4); - if (idx == 5) return vgetq_lane_s16(src, 5); - if (idx == 6) return vgetq_lane_s16(src, 6); - return vgetq_lane_s16(src, 7); -} - -template <int idx> inline void Store(const int16x8_t position, const int8x8_t reference_offset, - const int32x4_t mvs, int8_t* dst_reference_offset, + const int32x4_t mv, int8_t* dst_reference_offset, MotionVector* dst_mv) { - const ptrdiff_t offset = VgetqLaneS16<idx>(position); - int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]); - vst1q_lane_s32(d_mv, mvs, idx & 3); + const ptrdiff_t offset = vgetq_lane_s16(position, idx); + auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]); + vst1q_lane_s32(d_mv, mv, idx & 3); vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx); } template <int idx> inline void CheckStore(const int8_t* skips, const int16x8_t position, - const int8x8_t reference_offset, const int32x4_t mvs, + const int8x8_t reference_offset, const int32x4_t mv, int8_t* dst_reference_offset, MotionVector* dst_mv) { if (skips[idx] == 0) { - const ptrdiff_t offset = VgetqLaneS16<idx>(position); - int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]); - vst1q_lane_s32(d_mv, mvs, idx & 3); - vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx); + Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv); } } // 7.9.2. 
-void MotionFieldProjectionKernel_NEON( - const ReferenceFrameType* source_reference_type, const MotionVector* mv, - const uint8_t order_hint[kNumReferenceFrameTypes], - unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* motion_field) { +void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info, + const int reference_to_current_with_sign, + const int dst_sign, const int y8_start, + const int y8_end, const int x8_start, + const int x8_end, + TemporalMotionField* const motion_field) { const ptrdiff_t stride = motion_field->mv.columns(); // The column range has to be offset by kProjectionMvMaxHorizontalOffset since // coordinates in that range could end up being position_x8 because of @@ -197,14 +167,17 @@ void MotionFieldProjectionKernel_NEON( x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride)); const int adjusted_x8_end8 = adjusted_x8_end & ~7; const int leftover = adjusted_x8_end - adjusted_x8_end8; - const int8_t* const table = - reinterpret_cast<const int8_t*>(kProjectionMvDivisionLookup); + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; MotionVector* dst_mv = motion_field->mv[y8_start]; const int16x8_t d_sign = vdupq_n_s16(dst_sign); - int8_t reference_offsets[kNumReferenceFrameTypes]; - bool skip_reference[kNumReferenceFrameTypes]; - int8x8x4_t division_table[2]; static_assert(sizeof(int8_t) == sizeof(bool), ""); static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), ""); @@ -219,37 +192,13 @@ void MotionFieldProjectionKernel_NEON( // which means this optimization works for frame width up to 32K (each // position is a 8x8 block). assert(8 * stride <= 32768); - - const int8x8_t current_order_hints = vdup_n_s8(current_frame_order_hint); - const int8x8_t order_hints = vreinterpret_s8_u8(vld1_u8(order_hint)); - const int8x8_t diff = vsub_s8(current_order_hints, order_hints); - // |order_hint_shift_bits| - 24 could be -24. In this case diff is 0, - // and the behavior of left or right shifting -24 bits is defined for ARM NEON - // instructions, and the result of shifting 0 is still 0. - const int8x8_t left_shift_bits = vdup_n_s8(order_hint_shift_bits - 24); - const int8x8_t diff_shift_left = vshl_s8(diff, left_shift_bits); - const int8x8_t r_offsets = vshl_s8(diff_shift_left, vneg_s8(left_shift_bits)); - const uint8x8_t overflow = vcgt_s8(r_offsets, vdup_n_s8(kMaxFrameDistance)); - const uint8x8_t underflow = vcle_s8(r_offsets, vdup_n_s8(0)); - const int8x8_t sk = vreinterpret_s8_u8(vorr_u8(overflow, underflow)); - // Initialize skip_reference[kReferenceFrameIntra] to simplify branch - // conditions in projection. - const int8x8_t skip_reference8 = vset_lane_s8(-1, sk, 0); - vst1_s8(reinterpret_cast<int8_t*>(skip_reference), skip_reference8); - vst1_s8(reference_offsets, r_offsets); - - // The compiler is inefficient when using vld4_s64(). 
Instructions waste in - // copying from int64x1x4_t to int8x8x4_t, and there is no such vector - // reinterpret intrinsics available to the best of our knowledge. Anyway - // compiler is good enough to use 4 vld1q_s8(). - division_table[0].val[0] = vld1_s8(table + 0 * 8); - division_table[0].val[1] = vld1_s8(table + 1 * 8); - division_table[0].val[2] = vld1_s8(table + 2 * 8); - division_table[0].val[3] = vld1_s8(table + 3 * 8); - division_table[1].val[0] = vld1_s8(table + 4 * 8); - division_table[1].val[1] = vld1_s8(table + 5 * 8); - division_table[1].val[2] = vld1_s8(table + 6 * 8); - division_table[1].val[3] = vld1_s8(table + 7 * 8); + const int8x8_t skip_reference = + vld1_s8(reinterpret_cast<const int8_t*>(skip_references)); + const int8x8_t r_offsets = vld1_s8(reference_offsets); + const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions)); + int8x8x2_t division_table; + division_table.val[0] = vget_low_s8(table); + division_table.val[1] = vget_high_s8(table); int y8 = y8_start; do { @@ -261,8 +210,8 @@ void MotionFieldProjectionKernel_NEON( for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) { const int8x8_t source_reference_type8 = - vld1_s8(reinterpret_cast<const int8_t*>(source_reference_type + x8)); - const int8x8_t skip_r = vtbl1_s8(skip_reference8, source_reference_type8); + vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8)); + const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8); const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0); // Early termination #1 if all are skips. Chance is typically ~30-40%. if (early_skip == -1) continue; @@ -278,8 +227,8 @@ void MotionFieldProjectionKernel_NEON( if (skip_64 == -1) continue; const int16x8_t p_y = vmovl_s8(position_y8); const int16x8_t p_x = vmovl_s8(position_x8); - const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride); - const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8)); + const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride); + const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8)); if (skip_64 == 0) { // Store all. Chance is typically ~70-85% after Early termination #2. Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv); @@ -318,9 +267,9 @@ void MotionFieldProjectionKernel_NEON( const int delta = 8 - leftover; x8 = adjusted_x8_end - 8; const int8x8_t source_reference_type8 = vld1_s8( - reinterpret_cast<const int8_t*>(source_reference_type + x8)); + reinterpret_cast<const int8_t*>(source_reference_types + x8)); const int8x8_t skip_r = - vtbl1_s8(skip_reference8, source_reference_type8); + vtbl1_s8(skip_reference, source_reference_type8); const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0); // Early termination #1 if all are skips. @@ -336,8 +285,8 @@ void MotionFieldProjectionKernel_NEON( if (skip_64 != -1) { const int16x8_t p_y = vmovl_s8(position_y8); const int16x8_t p_x = vmovl_s8(position_x8); - const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride); - const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8)); + const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride); + const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8)); // Store up to 7 elements since leftover is at most 7. if (skip_64 == 0) { // Store all. 
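// Illustrative scalar sketch (not from the diff) of what the MvProjection()/
// MvProjectionClip() helpers earlier in this file compute. The function name,
// the |division| parameter name, and the value of kProjectionMvClamp
// ((1 << 14) - 1) are assumptions for illustration; the intrinsic-to-scalar
// mapping is noted inline.
#include <algorithm>
#include <cstdint>

int16_t MvProjectionScalar(int16_t mv, int16_t division, int numerator) {
  // vmull_s16() then vmulq_n_s32(): widen to 32 bits and multiply. Assumes
  // the product fits in 32 bits, as the vector code does.
  int32_t m = static_cast<int32_t>(mv) * division * numerator;
  // vsraq_n_s32(m, m, 31): add 0 for non-negative m and -1 for negative m,
  // so that, combined with the rounding shift below, negative values round
  // symmetrically with positive ones instead of being biased upward at the
  // halfway points.
  m += m >> 31;
  // vqrshrn_n_s32(..., 14): rounding right shift by 14.
  const int32_t rounded = (m + (1 << 13)) >> 14;
  // vminq_s16()/vmaxq_s16(): clamp to +/-kProjectionMvClamp.
  constexpr int32_t kProjectionMvClamp = (1 << 14) - 1;  // assumed value
  return static_cast<int16_t>(
      std::min(std::max(rounded, -kProjectionMvClamp), kProjectionMvClamp));
}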
@@ -373,13 +322,13 @@ void MotionFieldProjectionKernel_NEON( } } else { for (; x8 < adjusted_x8_end; ++x8) { - if (skip_reference[source_reference_type[x8]]) continue; - const int reference_offset = - reference_offsets[source_reference_type[x8]]; + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; MotionVector projection_mv; // reference_to_current_with_sign could be 0. GetMvProjection(mv[x8], reference_to_current_with_sign, - reference_offset, &projection_mv); + projection_divisions[source_reference_type], + &projection_mv); // Do not update the motion vector if the block position is not valid // or if position_x8 is outside the current range of x8_start and // x8_end. Note that position_y8 will always be within the range of @@ -395,12 +344,12 @@ void MotionFieldProjectionKernel_NEON( if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; dst_mv[position_y8 * stride + position_x8] = mv[x8]; dst_reference_offset[position_y8 * stride + position_x8] = - reference_offset; + reference_offsets[source_reference_type]; } } } - source_reference_type += stride; + source_reference_types += stride; mv += stride; dst_reference_offset += stride; dst_mv += stride; diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc index 5332180dfbc..da3ba1706e6 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc @@ -64,7 +64,7 @@ inline int16x8_t MvProjectionCompoundClip( const MotionVector* const temporal_mvs, const int8_t* const temporal_reference_offsets, const int reference_offsets[2]) { - const int32_t* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); + const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); const int32x2_t temporal_mv = vld1_s32(tmvs); const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0)); const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1)); @@ -79,7 +79,7 @@ inline int16x8_t MvProjectionSingleClip( const MotionVector* const temporal_mvs, const int8_t* const temporal_reference_offsets, const int reference_offset, int16x4_t* const lookup) { - const int16_t* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs); + const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs); const int16x8_t temporal_mv = vld1q_s16(tmvs); *lookup = vld1_lane_s16( &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0); @@ -98,27 +98,26 @@ inline int16x8_t MvProjectionSingleClip( return ProjectionClip(mv0, mv1); } -void LowPrecision(const int16x8_t mv, void* const candidate_mvs) { - const int16x8_t k1 = vdupq_n_s16(1); +inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) { + const int16x8_t kRoundDownMask = vdupq_n_s16(1); const uint16x8_t mvu = vreinterpretq_u16_s16(mv); const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15)); - const int16x8_t mv1 = vbicq_s16(mv0, k1); + const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask); vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1); } -void ForceInteger(const int16x8_t mv, void* const candidate_mvs) { - const int16x8_t k3 = vdupq_n_s16(3); - const int16x8_t k7 = vdupq_n_s16(7); +inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) { + const int16x8_t kRoundDownMask = vdupq_n_s16(7); const uint16x8_t mvu = 
vreinterpretq_u16_s16(mv); const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15)); - const int16x8_t mv1 = vaddq_s16(mv0, k3); - const int16x8_t mv2 = vbicq_s16(mv1, k7); + const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3)); + const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask); vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2); } void MvProjectionCompoundLowPrecision_NEON( const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, - const int reference_offsets[2], int count, + const int reference_offsets[2], const int count, CompoundMotionVector* candidate_mvs) { // |reference_offsets| non-zero check usually equals true and is ignored. // To facilitate the compilers, make a local copy of |reference_offsets|. diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc index 901aa3ddedf..c7fb739ba75 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc +++ b/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc @@ -133,7 +133,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride, assert(block_width >= 8); assert(block_height >= 8); - // Warp process applies for each 8x8 block (or smaller). + // Warp process applies for each 8x8 block. int start_y = block_start_y; do { int start_x = block_start_x; diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h b/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h index f13eb13605c..b4749ec6aea 100644 --- a/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h +++ b/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h @@ -36,6 +36,7 @@ void WeightMaskInit_NEON(); #define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON diff --git a/chromium/third_party/libgav1/src/src/dsp/cdef.cc b/chromium/third_party/libgav1/src/src/dsp/cdef.cc index 0ebee20d8b5..a7c720b77cc 100644 --- a/chromium/third_party/libgav1/src/src/dsp/cdef.cc +++ b/chromium/third_party/libgav1/src/src/dsp/cdef.cc @@ -29,6 +29,8 @@ namespace libgav1 { namespace dsp { namespace { +#include "src/dsp/cdef.inc" + // Silence unused function warnings when CdefDirection_C is obviated. #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \ @@ -119,21 +121,23 @@ int Constrain(int diff, int threshold, int damping) { // constant large value if at the boundary. And the input should be uint16_t. 
template <int bitdepth, typename Pixel> void CdefFilter_C(const void* const source, const ptrdiff_t source_stride, - const int rows4x4, const int columns4x4, const int curr_x, - const int curr_y, const int subsampling_x, - const int subsampling_y, const int primary_strength, - const int secondary_strength, const int damping, - const int direction, void* const dest, + const int block_width, const int block_height, + const int primary_strength, const int secondary_strength, + const int damping, const int direction, void* const dest, const ptrdiff_t dest_stride) { - static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0, - kCdefSecondaryTap1}; - const int coeff_shift = bitdepth - 8; - const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x; - const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y; - const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x); assert(block_width == 4 || block_width == 8); - const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y); assert(block_height == 4 || block_height == 8); + assert(direction >= 0 && direction <= 7); + constexpr int coeff_shift = bitdepth - 8; + // Section 5.9.19. CDEF params syntax. + assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift); + assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift && + secondary_strength != 3 << coeff_shift); + // damping is decreased by 1 for chroma. + assert((damping >= 3 && damping <= 6 + coeff_shift) || + (damping >= 2 && damping <= 5 + coeff_shift)); + static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0, + kCdefSecondaryTap1}; const auto* src = static_cast<const uint16_t*>(source); auto* dst = static_cast<Pixel*>(dest); const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel); @@ -146,7 +150,7 @@ void CdefFilter_C(const void* const source, const ptrdiff_t source_stride, uint16_t max_value = pixel_value; uint16_t min_value = pixel_value; for (int k = 0; k < 2; ++k) { - const int signs[] = {-1, 1}; + static constexpr int signs[] = {-1, 1}; for (const int& sign : signs) { int dy = sign * kCdefDirections[direction][k][0]; int dx = sign * kCdefDirections[direction][k][1]; @@ -160,10 +164,10 @@ void CdefFilter_C(const void* const source, const ptrdiff_t source_stride, max_value = std::max(value, max_value); min_value = std::min(value, min_value); } - const int offsets[] = {-2, 2}; + static constexpr int offsets[] = {-2, 2}; for (const int& offset : offsets) { - dy = sign * kCdefDirections[(direction + offset) & 7][k][0]; - dx = sign * kCdefDirections[(direction + offset) & 7][k][1]; + dy = sign * kCdefDirections[direction + offset][k][0]; + dx = sign * kCdefDirections[direction + offset][k][1]; value = src[dy * source_stride + dx + x]; // Note: the summation can ignore the condition check in SIMD // implementation. diff --git a/chromium/third_party/libgav1/src/src/dsp/cdef.inc b/chromium/third_party/libgav1/src/src/dsp/cdef.inc new file mode 100644 index 00000000000..c1a31361796 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/cdef.inc @@ -0,0 +1,29 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Constants used for cdef implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. + +const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2; + +// Mirror values and pad to 16 elements. +alignas(16) constexpr uint32_t kCdefDivisionTable[] = { + 840, 420, 280, 210, 168, 140, 120, 105, + 120, 140, 168, 210, 280, 420, 840, 0}; + +// Used when calculating odd |cost[x]| values to mask off unwanted elements. +// Holds elements 1 3 5 X 5 3 1 X +alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0, + 140, 210, 420, 0}; diff --git a/chromium/third_party/libgav1/src/src/dsp/common.h b/chromium/third_party/libgav1/src/src/dsp/common.h index 2532d177856..2a08403379f 100644 --- a/chromium/third_party/libgav1/src/src/dsp/common.h +++ b/chromium/third_party/libgav1/src/src/dsp/common.h @@ -45,15 +45,15 @@ struct RestorationUnitInfo : public MaxAlignedAllocable { WienerInfo wiener_info; }; -struct RestorationBuffer { +union RestorationBuffer { // For self-guided filter. - int* box_filter_process_output[2]; - ptrdiff_t box_filter_process_output_stride; - uint32_t* box_filter_process_intermediate[2]; - ptrdiff_t box_filter_process_intermediate_stride; + alignas(kMaxAlignment) uint16_t sgf_buffer[12 * (kRestorationUnitHeight + 2)]; // For wiener filter. - uint16_t* wiener_buffer; - ptrdiff_t wiener_buffer_stride; + // The array |intermediate| in Section 7.17.4, the intermediate results + // between the horizontal and vertical filters. + alignas(kMaxAlignment) uint16_t + wiener_buffer[(kRestorationUnitHeight + kSubPixelTaps - 1) * + kRestorationUnitWidth]; }; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/constants.cc b/chromium/third_party/libgav1/src/src/dsp/constants.cc index 1b9e6fc14e0..0099ca36c8c 100644 --- a/chromium/third_party/libgav1/src/src/dsp/constants.cc +++ b/chromium/third_party/libgav1/src/src/dsp/constants.cc @@ -81,8 +81,23 @@ const uint16_t kSgrScaleParameter[16][2] = { const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}}; -const int8_t kCdefDirections[8][2][2] = { - {{-1, 1}, {-2, 2}}, {{0, 1}, {-1, 2}}, {{0, 1}, {0, 2}}, {{0, 1}, {1, 2}}, - {{1, 1}, {2, 2}}, {{1, 0}, {2, 1}}, {{1, 0}, {2, 0}}, {{1, 0}, {2, -1}}}; +// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the +// beginning and end of the table. The cdef direction range is [0, 7] and the +// first index is offset +/-2. This removes the need to constrain the first +// index to the same range using e.g., & 7. 
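// Illustrative check (not from the diff) of the padding scheme described in
// the comment above and the table added just below: with kCdefDirections
// aliased to kCdefDirectionsPadded + 2 (as in cdef.inc), indexing with
// direction + offset for direction in [0, 7] and offset in {-2, 2} selects
// exactly the taps the removed 8-entry table selected via
// (direction + offset) & 7.
#include <cassert>
#include <cstdint>
#include <initializer_list>

extern const int8_t kCdefDirectionsPadded[12][2][2];

void VerifyCdefDirectionPadding() {
  // The old table, as removed above.
  static const int8_t kOld[8][2][2] = {
      {{-1, 1}, {-2, 2}}, {{0, 1}, {-1, 2}}, {{0, 1}, {0, 2}},
      {{0, 1}, {1, 2}},   {{1, 1}, {2, 2}},  {{1, 0}, {2, 1}},
      {{1, 0}, {2, 0}},   {{1, 0}, {2, -1}}};
  const int8_t(*const directions)[2][2] = kCdefDirectionsPadded + 2;
  for (int direction = 0; direction < 8; ++direction) {
    for (const int offset : {-2, 2}) {
      for (int k = 0; k < 2; ++k) {
        assert(directions[direction + offset][k][0] ==
               kOld[(direction + offset) & 7][k][0]);
        assert(directions[direction + offset][k][1] ==
               kOld[(direction + offset) & 7][k][1]);
      }
    }
  }
}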
+const int8_t kCdefDirectionsPadded[12][2][2] = { + {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6] + {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7] + {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions + {{0, 1}, {-1, 2}}, // + {{0, 1}, {0, 2}}, // + {{0, 1}, {1, 2}}, // + {{1, 1}, {2, 2}}, // + {{1, 0}, {2, 1}}, // + {{1, 0}, {2, 0}}, // + {{1, 0}, {2, -1}}, // End Cdef_Directions + {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0] + {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1] +}; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/constants.h b/chromium/third_party/libgav1/src/src/dsp/constants.h index d588d22af41..7c1b62c4926 100644 --- a/chromium/third_party/libgav1/src/src/dsp/constants.h +++ b/chromium/third_party/libgav1/src/src/dsp/constants.h @@ -64,7 +64,7 @@ extern const uint16_t kSgrScaleParameter[16][2]; extern const uint8_t kCdefPrimaryTaps[2][2]; -extern const int8_t kCdefDirections[8][2][2]; +extern const int8_t kCdefDirectionsPadded[12][2][2]; } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/dsp/dsp.cc b/chromium/third_party/libgav1/src/src/dsp/dsp.cc index db285a5f8a0..c1df27634cc 100644 --- a/chromium/third_party/libgav1/src/src/dsp/dsp.cc +++ b/chromium/third_party/libgav1/src/src/dsp/dsp.cc @@ -94,6 +94,8 @@ void DspInit() { LoopFilterInit_SSE4_1(); LoopRestorationInit_SSE4_1(); MaskBlendInit_SSE4_1(); + MotionFieldProjectionInit_SSE4_1(); + MotionVectorSearchInit_SSE4_1(); ObmcInit_SSE4_1(); SuperResInit_SSE4_1(); WarpInit_SSE4_1(); diff --git a/chromium/third_party/libgav1/src/src/dsp/dsp.h b/chromium/third_party/libgav1/src/src/dsp/dsp.h index f5b5b366947..470436faf26 100644 --- a/chromium/third_party/libgav1/src/src/dsp/dsp.h +++ b/chromium/third_party/libgav1/src/src/dsp/dsp.h @@ -25,6 +25,7 @@ #include "src/dsp/constants.h" #include "src/dsp/film_grain_common.h" #include "src/utils/cpu.h" +#include "src/utils/reference_info.h" #include "src/utils/types.h" namespace libgav1 { @@ -328,20 +329,15 @@ using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride, // Cdef filtering function signature. Section 7.15.3. // |source| is a pointer to the input block. |source_stride| is given in bytes. -// |rows4x4| and |columns4x4| are frame sizes in units of 4x4 pixels. -// |curr_x| and |curr_y| are current position in units of pixels. -// |subsampling_x|, |subsampling_y| are the subsampling factors of current -// plane. +// |block_width|, |block_height| are the width/height of the input block. // |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering // parameters. // |direction| is the filtering direction. // |dest| is the output buffer. |dest_stride| is given in bytes. using CdefFilteringFunc = void (*)(const void* source, ptrdiff_t source_stride, - int rows4x4, int columns4x4, int curr_x, - int curr_y, int subsampling_x, - int subsampling_y, int primary_strength, - int secondary_strength, int damping, - int direction, void* dest, + int block_width, int block_height, + int primary_strength, int secondary_strength, + int damping, int direction, void* dest, ptrdiff_t dest_stride); // Upscaling process function signature. Section 7.16. @@ -360,7 +356,8 @@ using SuperResRowFunc = void (*)(const void* source, const int upscaled_width, // |source| is the input frame buffer, which is deblocked and cdef filtered. // |dest| is the output. // |restoration_info| contains loop restoration information, such as filter -// type, strength. |source| and |dest| share the same stride given in bytes. 
+// type, strength. +// |source_stride| and |dest_stride| are given in pixels. // |buffer| contains buffers required for self guided filter and wiener filter. // They must be initialized before calling. using LoopRestorationFunc = void (*)( @@ -745,15 +742,7 @@ struct FilmGrainFuncs { }; // Motion field projection function signature. Section 7.9. -// |source_reference_type| corresponds to MfRefFrames[i * 2 + 1][j * 2 + 1] in -// the spec. -// |mv| corresponds to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. -// |order_hint| points to an array of kNumReferenceFrameTypes elements which -// specifies OrderHintBits least significant bits of the expected output order -// for reference frames. -// |current_frame_order_hint| specifies OrderHintBits least significant bits of -// the expected output order for this frame. -// |order_hint_shift_bits| equals (32 - OrderHintBits) % 32. +// |reference_info| provides reference information for motion field projection. // |reference_to_current_with_sign| is the precalculated reference frame id // distance from current frame. // |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others. @@ -763,11 +752,9 @@ struct FilmGrainFuncs { // |motion_field| is the output which saves the projected motion field // information. using MotionFieldProjectionKernelFunc = void (*)( - const ReferenceFrameType* source_reference_type, const MotionVector* mv, - const uint8_t order_hint[kNumReferenceFrameTypes], - unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* motion_field); + const ReferenceInfo& reference_info, int reference_to_current_with_sign, + int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end, + TemporalMotionField* motion_field); // Compound temporal motion vector projection function signature. // Section 7.9.3 and 7.10.2.10. 
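// Hypothetical call site (not from the diff) for the reworked kernel
// signature above: the per-reference order hints now travel inside
// ReferenceInfo rather than as separate arrays. GetDspTable() is assumed to
// be libgav1's existing per-bitdepth accessor for the Dsp table declared in
// this header, and DspInit() (see dsp.cc) must have run first.
void ProjectMotionField(const ReferenceInfo& reference_info,
                        const int reference_to_current_with_sign,
                        const int dst_sign, const int y8_start,
                        const int y8_end, const int x8_start, const int x8_end,
                        TemporalMotionField* const motion_field) {
  const dsp::Dsp* const d = dsp::GetDspTable(8);
  d->motion_field_projection_kernel(reference_info,
                                    reference_to_current_with_sign, dst_sign,
                                    y8_start, y8_end, x8_start, x8_end,
                                    motion_field);
}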
@@ -797,35 +784,35 @@ using MvProjectionSingleFunc = void (*)( int reference_offset, int count, MotionVector* candidate_mvs); struct Dsp { - IntraPredictorFuncs intra_predictors; + AverageBlendFunc average_blend; + CdefDirectionFunc cdef_direction; + CdefFilteringFunc cdef_filter; + CflIntraPredictorFuncs cfl_intra_predictors; + CflSubsamplerFuncs cfl_subsamplers; + ConvolveFuncs convolve; + ConvolveScaleFuncs convolve_scale; DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1; DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2; DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3; + DistanceWeightedBlendFunc distance_weighted_blend; + FilmGrainFuncs film_grain; FilterIntraPredictorFunc filter_intra_predictor; - CflIntraPredictorFuncs cfl_intra_predictors; - CflSubsamplerFuncs cfl_subsamplers; + InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp; IntraEdgeFilterFunc intra_edge_filter; IntraEdgeUpsamplerFunc intra_edge_upsampler; + IntraPredictorFuncs intra_predictors; InverseTransformAddFuncs inverse_transforms; LoopFilterFuncs loop_filters; - CdefDirectionFunc cdef_direction; - CdefFilteringFunc cdef_filter; - SuperResRowFunc super_res_row; LoopRestorationFuncs loop_restorations; + MaskBlendFuncs mask_blend; MotionFieldProjectionKernelFunc motion_field_projection_kernel; MvProjectionCompoundFunc mv_projection_compound[3]; MvProjectionSingleFunc mv_projection_single[3]; - ConvolveFuncs convolve; - ConvolveScaleFuncs convolve_scale; - WeightMaskFuncs weight_mask; - AverageBlendFunc average_blend; - DistanceWeightedBlendFunc distance_weighted_blend; - MaskBlendFuncs mask_blend; - InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp; ObmcBlendFuncs obmc_blend; - WarpFunc warp; + SuperResRowFunc super_res_row; WarpCompoundFunc warp_compound; - FilmGrainFuncs film_grain; + WarpFunc warp; + WeightMaskFuncs weight_mask; }; // Initializes function pointers based on build config and runtime diff --git a/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake b/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake index 06e23ee0f4f..00574fa1953 100644 --- a/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake +++ b/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake @@ -24,6 +24,7 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/average_blend.h" "${libgav1_source}/dsp/cdef.cc" "${libgav1_source}/dsp/cdef.h" + "${libgav1_source}/dsp/cdef.inc" "${libgav1_source}/dsp/common.h" "${libgav1_source}/dsp/constants.cc" "${libgav1_source}/dsp/constants.h" @@ -42,6 +43,7 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/intrapred.h" "${libgav1_source}/dsp/inverse_transform.cc" "${libgav1_source}/dsp/inverse_transform.h" + "${libgav1_source}/dsp/inverse_transform.inc" "${libgav1_source}/dsp/loop_filter.cc" "${libgav1_source}/dsp/loop_filter.h" "${libgav1_source}/dsp/loop_restoration.cc" @@ -54,6 +56,7 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/motion_vector_search.h" "${libgav1_source}/dsp/obmc.cc" "${libgav1_source}/dsp/obmc.h" + "${libgav1_source}/dsp/obmc.inc" "${libgav1_source}/dsp/super_res.cc" "${libgav1_source}/dsp/super_res.h" "${libgav1_source}/dsp/warp.cc" @@ -128,6 +131,10 @@ list(APPEND libgav1_dsp_sources_sse4 "${libgav1_source}/dsp/x86/loop_restoration_sse4.h" "${libgav1_source}/dsp/x86/mask_blend_sse4.cc" "${libgav1_source}/dsp/x86/mask_blend_sse4.h" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h" + 
"${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc" + "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h" "${libgav1_source}/dsp/x86/obmc_sse4.cc" "${libgav1_source}/dsp/x86/obmc_sse4.h" "${libgav1_source}/dsp/x86/super_res_sse4.cc" diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc b/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc index 946952b029c..6cad97d4280 100644 --- a/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc +++ b/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc @@ -31,10 +31,10 @@ template <int bitdepth, typename Pixel> struct LoopFilterFuncs_C { LoopFilterFuncs_C() = delete; - static const int kMaxPixel = (1 << bitdepth) - 1; - static const int kMinSignedPixel = -(1 << (bitdepth - 1)); - static const int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1; - static const int kFlatThresh = 1 << (bitdepth - 8); + static constexpr int kMaxPixel = (1 << bitdepth) - 1; + static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1)); + static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1; + static constexpr int kFlatThresh = 1 << (bitdepth - 8); static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, int hev_thresh); diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc index 467e33492fd..b2ae99c0882 100644 --- a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc +++ b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc @@ -26,15 +26,6 @@ namespace libgav1 { namespace dsp { -namespace { - -// Precision of a division table (mtable) -constexpr int kSgrProjScaleBits = 20; -constexpr int kSgrProjReciprocalBits = 12; -// Core self-guided restoration precision bits. -constexpr int kSgrProjSgrBits = 8; -// Precision bits of generated values higher than source before projection. -constexpr int kSgrProjRestoreBits = 4; // Section 7.17.3. // a2: range [1, 256]. 
@@ -44,7 +35,7 @@ constexpr int kSgrProjRestoreBits = 4; // a2 = 1; // else // a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); -constexpr int kXByXPlus1[256] = { +const int kXByXPlus1[256] = { 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, @@ -64,65 +55,51 @@ constexpr int kXByXPlus1[256] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256}; +// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); +// sgr_ma2 = 256 - a2 +const uint8_t kSgrMa2Lookup[256] = { + 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14, + 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, + 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0}; + +namespace { + +constexpr ptrdiff_t kIntermediateStride = kRestorationUnitWidth + 2; + +struct SgrIntermediateBuffer { + uint16_t a; // [1, 256] + uint32_t b; // < 2^20. 32-bit is required for bitdepth 10 and up. +}; + +struct SgrBuffer { + // Circular buffer to save memory. + // The 2d arrays A and B in Section 7.17.3, the intermediate results in the + // box filter process. Reused for pass 0 and pass 1. Pass 0 uses 2 rows. Pass + // 1 uses 3 or 4 rows. + SgrIntermediateBuffer intermediate[6 * kIntermediateStride]; +}; + constexpr int kOneByX[25] = { 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; -// Compute integral image. In an integral image, each pixel value of (xi, yi) -// is the sum of all pixel values {(x, y) | x <= xi, y <= yi} from the source -// image. -// The integral image (II) can be calculated as: -// II(D) = Pixel(D) + II(B) + II(C) - II(A), -// where the rectangular region ABCD is -// A = (x, y), B = (x + 1, y), C = (x, y + 1), D = (x + 1, y + 1). -// Integral image helps to compute the sum of a rectangular area fast. -// The box centered at (x, y), with radius r, is rectangular ABCD: -// A = (x - r, y - r), B = (x + r, y - r), -// C = (x - r, y + r), D = (x + r, y + r), -// The sum of the box, or the rectangular ABCD can be calculated with the -// integral image (II): -// sum = II(D) - II(B') - II(C') + II(A'). -// A' = (x - r - 1, y - r - 1), B' = (x + r, y - r - 1), -// C' = (x - r - 1, y + r), D = (x + r, y + r), -// Here we calculate the integral image, as well as the squared integral image. 
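// Quick relation check (not from the diff) between the two lookup tables
// above: each new kSgrMa2Lookup entry is 256 - kXByXPlus1[z], with the
// endpoints z == 0 and z >= 255 treated as the special cases (1 and 256) that
// the first and last kXByXPlus1 entries encode. kSgrProjSgrBits is 8, as in
// the constants removed from the top of this file.
#include <cassert>

void VerifySgrLookups() {
  for (int z = 0; z < 256; ++z) {
    int a2;
    if (z == 0) {
      a2 = 1;
    } else if (z == 255) {
      a2 = 256;
    } else {
      a2 = ((z << 8) + (z >> 1)) / (z + 1);
    }
    assert(a2 == kXByXPlus1[z]);
    assert(kSgrMa2Lookup[z] == 256 - a2);
  }
}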
-template <typename Pixel> -void ComputeIntegralImage(const Pixel* const src, ptrdiff_t src_stride, - int width, int height, uint16_t* integral_image, - uint32_t* square_integral_image, - ptrdiff_t image_stride) { - memset(integral_image, 0, image_stride * sizeof(integral_image[0])); - memset(square_integral_image, 0, - image_stride * sizeof(square_integral_image[0])); - - const Pixel* src_ptr = src; - uint16_t* integral_image_ptr = integral_image + image_stride + 1; - uint32_t* square_integral_image_ptr = - square_integral_image + image_stride + 1; - int y = 0; - do { - integral_image_ptr[-1] = 0; - square_integral_image_ptr[-1] = 0; - for (int x = 0; x < width; ++x) { - integral_image_ptr[x] = src_ptr[x] + integral_image_ptr[x - 1] + - integral_image_ptr[x - image_stride] - - integral_image_ptr[x - image_stride - 1]; - square_integral_image_ptr[x] = - src_ptr[x] * src_ptr[x] + square_integral_image_ptr[x - 1] + - square_integral_image_ptr[x - image_stride] - - square_integral_image_ptr[x - image_stride - 1]; - } - src_ptr += src_stride; - integral_image_ptr += image_stride; - square_integral_image_ptr += image_stride; - } while (++y < height); -} - template <int bitdepth, typename Pixel> struct LoopRestorationFuncs_C { LoopRestorationFuncs_C() = delete; - // |stride| for SelfGuidedFilter and WienerFilter is given in bytes. static void SelfGuidedFilter(const void* source, void* dest, const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride, ptrdiff_t dest_stride, @@ -132,15 +109,18 @@ struct LoopRestorationFuncs_C { const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride, ptrdiff_t dest_stride, int width, int height, RestorationBuffer* buffer); - // |stride| for box filter processing is in Pixels. - static void BoxFilterPreProcess(const RestorationUnitInfo& restoration_info, - const uint16_t* integral_image, - const uint32_t* square_integral_image, - int width, int height, int pass, - RestorationBuffer* buffer); static void BoxFilterProcess(const RestorationUnitInfo& restoration_info, - const Pixel* src, ptrdiff_t stride, int width, - int height, RestorationBuffer* buffer); + const Pixel* src, ptrdiff_t src_stride, + int width, int height, SgrBuffer* buffer, + Pixel* dst, ptrdiff_t dst_stride); + static void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const Pixel* src, ptrdiff_t src_stride, + int width, int height, SgrBuffer* buffer, + Pixel* dst, ptrdiff_t dst_stride); + static void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const Pixel* src, ptrdiff_t src_stride, + int width, int height, SgrBuffer* buffer, + Pixel* dst, ptrdiff_t dst_stride); }; // Note: range of wiener filter coefficients. @@ -154,7 +134,7 @@ struct LoopRestorationFuncs_C { // filter[3] = 0 - (filter[0] + filter[1] + filter[2]) * 2. // Thus in libaom's computation, an offset of 128 is needed for filter[3]. 
inline void PopulateWienerCoefficients( - const RestorationUnitInfo& restoration_info, int direction, + const RestorationUnitInfo& restoration_info, const int direction, int16_t* const filter) { filter[3] = 128; for (int i = 0; i < 3; ++i) { @@ -178,26 +158,64 @@ inline int CountZeroCoefficients(const int16_t* const filter) { return number_zero_coefficients; } -template <typename Pixel> -inline int WienerHorizontal(const Pixel* const source, - const int16_t* const filter, - const int number_zero_coefficients, int sum) { +template <int bitdepth, typename Pixel> +inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride, + const int width, const int height, + const int16_t* const filter, + const int number_zero_coefficients, + uint16_t** wiener_buffer) { constexpr int kCenterTap = (kSubPixelTaps - 1) / 2; - for (int k = number_zero_coefficients; k < kCenterTap; ++k) { - sum += filter[k] * (source[k] + source[kSubPixelTaps - 2 - k]); - } - return sum; + constexpr int kRoundBitsHorizontal = (bitdepth == 12) + ? kInterRoundBitsHorizontal12bpp + : kInterRoundBitsHorizontal; + constexpr int limit = + (1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1; + constexpr int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1); + int y = height; + do { + int x = 0; + do { + // sum fits into 16 bits only when bitdepth = 8. + int sum = horizontal_rounding; + for (int k = number_zero_coefficients; k < kCenterTap; ++k) { + sum += filter[k] * (source[x + k] + source[x + kSubPixelTaps - 2 - k]); + } + sum += filter[kCenterTap] * source[x + kCenterTap]; + const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal); + (*wiener_buffer)[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); + } while (++x < width); + source += source_stride; + *wiener_buffer += width; + } while (--y != 0); } -inline int WienerVertical(const uint16_t* const source, - const int16_t* const filter, const int width, - const int number_zero_coefficients, int sum) { +template <int bitdepth, typename Pixel> +inline void WienerVertical(const uint16_t* wiener_buffer, const int width, + const int height, const int16_t* const filter, + const int number_zero_coefficients, void* const dest, + const ptrdiff_t dest_stride) { constexpr int kCenterTap = (kSubPixelTaps - 1) / 2; - for (int k = number_zero_coefficients; k < kCenterTap; ++k) { - sum += filter[k] * - (source[k * width] + source[(kSubPixelTaps - 2 - k) * width]); - } - return sum; + constexpr int kRoundBitsVertical = + (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical; + constexpr int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1)); + auto* dst = static_cast<Pixel*>(dest); + int y = height; + do { + int x = 0; + do { + // sum needs 32 bits. + int sum = vertical_rounding; + for (int k = number_zero_coefficients; k < kCenterTap; ++k) { + sum += filter[k] * (wiener_buffer[k * width + x] + + wiener_buffer[(kSubPixelTaps - 2 - k) * width + x]); + } + sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x]; + const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); + dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); + } while (++x < width); + wiener_buffer += width; + dst += dest_stride; + } while (--y != 0); } // Note: bit range for wiener filter. 
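// Worked example (not from the diff) of the bounds used in WienerHorizontal()
// above for bitdepth 8, assuming the usual libgav1 values
// kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3.
constexpr int kExampleBitdepth = 8;
constexpr int kExampleWienerFilterBits = 7;     // assumed value
constexpr int kExampleRoundBitsHorizontal = 3;  // assumed value
// Largest value the horizontal stage can produce: 13 bits for 8 bpp, so the
// intermediate |wiener_buffer| values comfortably fit in uint16_t.
constexpr int kExampleLimit =
    (1 << (kExampleBitdepth + 1 + kExampleWienerFilterBits -
           kExampleRoundBitsHorizontal)) - 1;
static_assert(kExampleLimit == 8191, "13-bit horizontal output for 8 bpp");
// Offset added before filtering so the clipped horizontal sum is
// non-negative.
constexpr int kExampleHorizontalRounding =
    1 << (kExampleBitdepth + kExampleWienerFilterBits - 1);
static_assert(kExampleHorizontalRounding == 16384, "1 << 14 for 8 bpp");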
@@ -223,13 +241,6 @@ void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter( ptrdiff_t dest_stride, int width, int height, RestorationBuffer* const buffer) { constexpr int kCenterTap = (kSubPixelTaps - 1) / 2; - constexpr int kRoundBitsHorizontal = (bitdepth == 12) - ? kInterRoundBitsHorizontal12bpp - : kInterRoundBitsHorizontal; - constexpr int kRoundBitsVertical = - (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical; - const int limit = - (1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1; int16_t filter_horizontal[kSubPixelTaps / 2]; int16_t filter_vertical[kSubPixelTaps / 2]; PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, @@ -240,448 +251,470 @@ void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter( CountZeroCoefficients(filter_horizontal); const int number_zero_coefficients_vertical = CountZeroCoefficients(filter_vertical); - - source_stride /= sizeof(Pixel); - dest_stride /= sizeof(Pixel); + const int number_rows_to_skip = + std::max(number_zero_coefficients_vertical, 1); // horizontal filtering. const auto* src = static_cast<const Pixel*>(source); - src -= (kCenterTap - number_zero_coefficients_vertical) * source_stride + - kCenterTap; - auto* wiener_buffer = - buffer->wiener_buffer + number_zero_coefficients_vertical * width; - const int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1); - int y = height + kSubPixelTaps - 2 - 2 * number_zero_coefficients_vertical; + src -= (kCenterTap - number_rows_to_skip) * source_stride + kCenterTap; + auto* wiener_buffer = buffer->wiener_buffer + number_rows_to_skip * width; + const int height_horizontal = + height + kSubPixelTaps - 2 - 2 * number_rows_to_skip; if (number_zero_coefficients_horizontal == 0) { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. - int sum = horizontal_rounding; - sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 0, sum); - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 0, + &wiener_buffer); } else if (number_zero_coefficients_horizontal == 1) { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. - int sum = horizontal_rounding; - sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 1, sum); - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 1, + &wiener_buffer); } else if (number_zero_coefficients_horizontal == 2) { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. 
- int sum = horizontal_rounding; - sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 2, sum); - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 2, + &wiener_buffer); } else { - do { - int x = 0; - do { - // sum fits into 16 bits only when bitdepth = 8. - int sum = horizontal_rounding; - sum += filter_horizontal[kCenterTap] * src[x + kCenterTap]; - const int rounded_sum = - RightShiftWithRounding(sum, kRoundBitsHorizontal); - wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit)); - } while (++x < width); - src += source_stride; - wiener_buffer += width; - } while (--y != 0); + WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, + height_horizontal, filter_horizontal, 3, + &wiener_buffer); } // vertical filtering. - const int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1)); - auto* dst = static_cast<Pixel*>(dest); - wiener_buffer = buffer->wiener_buffer; - y = height; - if (number_zero_coefficients_vertical == 0) { - do { - int x = 0; - do { - // sum needs 32 bits. - int sum = vertical_rounding; - sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 0, sum); - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. + memcpy(wiener_buffer, wiener_buffer - width, + sizeof(*wiener_buffer) * width); + memcpy(buffer->wiener_buffer, buffer->wiener_buffer + width, + sizeof(*wiener_buffer) * width); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 0, dest, dest_stride); } else if (number_zero_coefficients_vertical == 1) { - do { - int x = 0; - do { - // sum needs 32 bits. - int sum = vertical_rounding; - sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 1, sum); - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 1, dest, dest_stride); } else if (number_zero_coefficients_vertical == 2) { - do { - int x = 0; - do { - // sum needs 32 bits. 
- int sum = vertical_rounding; - sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 2, sum); - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 2, dest, dest_stride); } else { - do { - int x = 0; - do { - // sum needs 32 bits. - int sum = vertical_rounding; - sum += - filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x]; - const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); - dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); - } while (++x < width); - dst += dest_stride; - wiener_buffer += width; - } while (--y != 0); + WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height, + filter_vertical, 3, dest, dest_stride); } } +//------------------------------------------------------------------------------ +// SGR + +template <int bitdepth> +inline void CalculateIntermediate(const uint32_t s, uint32_t a, + const uint32_t b, const uint32_t n, + SgrIntermediateBuffer* const intermediate) { + // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1). + // since max bitdepth = 12, max < 2^31. + // after shift, a < 2^16 * n < 2^22 regardless of bitdepth + a = RightShiftWithRounding(a, (bitdepth - 8) << 1); + // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19. + // d < 2^8 * n < 2^14 regardless of bitdepth + const uint32_t d = RightShiftWithRounding(b, bitdepth - 8); + // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. + // This bound on p is due to: + // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances + // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b. + // This is an artifact of rounding, and can only happen if all pixels + // are (almost) identical, so in this case we saturate to p=0. + const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d; + // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale < + // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12 + // (this holds even after accounting for the rounding in s) + const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits); + // a2: range [1, 256]. + uint32_t a2 = kXByXPlus1[std::min(z, 255u)]; + const uint32_t one_over_n = kOneByX[n - 1]; + // (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n, + // one_over_n = round(2^12 / n) + // => the product here is < 2^(20 + bitdepth) <= 2^32, + // and b is set to a value < 2^(8 + bitdepth). + // This holds even with the rounding in one_over_n and in the overall + // result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8. 
+ const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n; + intermediate->a = a2; + intermediate->b = RightShiftWithRounding(b2, kSgrProjReciprocalBits); +} + template <int bitdepth, typename Pixel> -void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess( - const RestorationUnitInfo& restoration_info, const uint16_t* integral_image, - const uint32_t* square_integral_image, int width, int height, int pass, - RestorationBuffer* const buffer) { - const int sgr_proj_index = restoration_info.sgr_proj_info.index; - const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2]; - assert(radius != 0); - const uint32_t n = (2 * radius + 1) * (2 * radius + 1); - // const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1]; - // n2_with_scale: max value < 2^16. min value is 4. - // const uint32_t n2_with_scale = n * n * scale; - // s: max value < 2^12. - // const uint32_t s = - // ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale; - const uint32_t s = kSgrScaleParameter[sgr_proj_index][pass]; - assert(s != 0); - const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride; - const ptrdiff_t integral_image_stride = - kRestorationProcessingUnitSizeWithBorders + 1; - // The size of the intermediate result buffer is the size of the filter area - // plus horizontal (3) and vertical (3) padding. The processing start point - // is the filter area start point -1 row and -1 column. Therefore we need to - // set offset and use the intermediate_result as the start point for - // processing. - const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * array_stride + kRestorationBorder; - uint32_t* intermediate_result[2] = { - buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset - - array_stride, - buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset - - array_stride}; - - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. - const int step = (pass == 0) ? 2 : 1; - const ptrdiff_t intermediate_stride = step * array_stride; - for (int y = -1; y <= height; y += step) { - for (int x = -1; x <= width; ++x) { - // The integral image helps to calculate the sum of the square - // centered at (x, y). - // The calculation of a, b is equal to the following lines: - // uint32_t a = 0; - // uint32_t b = 0; - // for (int dy = -radius; dy <= radius; ++dy) { - // for (int dx = -radius; dx <= radius; ++dx) { - // const Pixel source = src[(y + dy) * stride + (x + dx)]; - // a += source * source; - // b += source; - // } - // } - const int top_left = - (y + kRestorationBorder - radius) * integral_image_stride + x + - kRestorationBorder - radius; - const int top_right = top_left + 2 * radius + 1; - const int bottom_left = - top_left + (2 * radius + 1) * integral_image_stride; - const int bottom_right = bottom_left + 2 * radius + 1; - uint32_t a = square_integral_image[bottom_right] - - square_integral_image[bottom_left] - - square_integral_image[top_right] + - square_integral_image[top_left]; - uint32_t b; - - if (bitdepth <= 10 || radius < 2) { - // The following cast is mandatory to get truncated sum. 
- b = static_cast<uint16_t>( - integral_image[bottom_right] - integral_image[bottom_left] - - integral_image[top_right] + integral_image[top_left]); - } else { - assert(radius == 2); - const uint16_t b_top_15_pixels = - integral_image[top_right + 3 * integral_image_stride] - - integral_image[top_left + 3 * integral_image_stride] - - integral_image[top_right] + integral_image[top_left]; - const uint16_t b_bottom_10_pixels = - integral_image[bottom_right] - integral_image[bottom_left] - - integral_image[top_right + 3 * integral_image_stride] + - integral_image[top_left + 3 * integral_image_stride]; - b = b_top_15_pixels + b_bottom_10_pixels; - } +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessTop( + const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s, + SgrIntermediateBuffer* intermediate) { + uint32_t a = 0; + uint32_t b = 0; + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[dx]; + a += source * source; + b += source; + } + a += a; + b += b; + for (int dy = 1; dy < 4; ++dy) { + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[dy * stride + dx]; + a += source * source; + b += source; + } + } + CalculateIntermediate<bitdepth>(s, a, b, 25, intermediate); + int x = width - 1; + do { + { + const Pixel source0 = src[0]; + const Pixel source1 = src[5]; + a += 2 * (source1 * source1 - source0 * source0); + b += 2 * (source1 - source0); + } + int dy = 1; + do { + const Pixel source0 = src[dy * stride]; + const Pixel source1 = src[dy * stride + 5]; + a -= source0 * source0; + a += source1 * source1; + b -= source0; + b += source1; + } while (++dy < 4); + src++; + CalculateIntermediate<bitdepth>(s, a, b, 25, ++intermediate); + } while (--x != 0); +} - // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1). - // since max bitdepth = 12, max < 2^31. - // after shift, a < 2^16 * n < 2^22 regardless of bitdepth - a = RightShiftWithRounding(a, (bitdepth - 8) << 1); - // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19. - // d < 2^8 * n < 2^14 regardless of bitdepth - const uint32_t d = RightShiftWithRounding(b, bitdepth - 8); - // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, - // and p itself satisfies p < 2^14 * n^2 < 2^26. - // This bound on p is due to: - // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances - // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b. - // This is an artifact of rounding, and can only happen if all pixels - // are (almost) identical, so in this case we saturate to p=0. - const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d; - // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale < - // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12 - // (this holds even after accounting for the rounding in s) - const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits); - // a2: range [1, 256]. - uint32_t a2 = kXByXPlus1[std::min(z, 255u)]; - const uint32_t one_over_n = kOneByX[n - 1]; - // (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n, - // one_over_n = round(2^12 / n) - // => the product here is < 2^(20 + bitdepth) <= 2^32, - // and b is set to a value < 2^(8 + bitdepth). - // This holds even with the rounding in one_over_n and in the overall - // result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8. 
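[Editor's aside, not part of the patch.] The range comment above (kept in the new CalculateIntermediate() and deleted here) leans on Popoviciu's inequality; a compact restatement of the step it relies on, using the comment's own symbols. With the window samples normalized to 8 bits by the two shifts, write them as x_1, ..., x_n with 0 <= x_i <= 2^8 - 1, so a ~ sum of x_i^2 and d ~ sum of x_i. Then

\[
  p = a\,n - d^2 \approx n\sum_i x_i^2 - \Bigl(\sum_i x_i\Bigr)^{\!2} = n^2\,\operatorname{Var}(x),
  \qquad
  \operatorname{Var}(x) \le \frac{(2^8-1)^2}{4} < 2^{14}
  \;\Rightarrow\; p < 2^{14} n^2 .
\]

The identity is exact only before rounding; the two RightShiftWithRounding() calls can push a*n slightly below d*d, which is why the code clamps p to zero in that case.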
- const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n; - intermediate_result[0][x] = a2; - intermediate_result[1][x] = - RightShiftWithRounding(b2, kSgrProjReciprocalBits); +template <int bitdepth, typename Pixel, int size> +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s, + SgrIntermediateBuffer* intermediate) { + const int n = size * size; + uint32_t a = 0; + uint32_t b = 0; + for (int dy = 0; dy < size; ++dy) { + for (int dx = 0; dx < size; ++dx) { + const Pixel source = src[dy * stride + dx]; + a += source * source; + b += source; } - intermediate_result[0] += intermediate_stride; - intermediate_result[1] += intermediate_stride; } + CalculateIntermediate<bitdepth>(s, a, b, n, intermediate); + int x = width - 1; + do { + int dy = 0; + do { + const Pixel source0 = src[dy * stride]; + const Pixel source1 = src[dy * stride + size]; + a -= source0 * source0; + a += source1 * source1; + b -= source0; + b += source1; + } while (++dy < size); + src++; + CalculateIntermediate<bitdepth>(s, a, b, n, ++intermediate); + } while (--x != 0); +} + +template <int bitdepth, typename Pixel> +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessBottom( + const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s, + SgrIntermediateBuffer* intermediate) { + uint32_t a = 0; + uint32_t b = 0; + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[3 * stride + dx]; + a += source * source; + b += source; + } + a += a; + b += b; + for (int dy = 0; dy < 3; ++dy) { + for (int dx = 0; dx < 5; ++dx) { + const Pixel source = src[dy * stride + dx]; + a += source * source; + b += source; + } + } + CalculateIntermediate<bitdepth>(s, a, b, 25, intermediate); + int x = width - 1; + do { + { + const Pixel source0 = src[3 * stride + 0]; + const Pixel source1 = src[3 * stride + 5]; + a += 2 * (source1 * source1 - source0 * source0); + b += 2 * (source1 - source0); + } + int dy = 0; + do { + const Pixel source0 = src[dy * stride]; + const Pixel source1 = src[dy * stride + 5]; + a -= source0 * source0; + a += source1 * source1; + b -= source0; + b += source1; + } while (++dy < 3); + src++; + CalculateIntermediate<bitdepth>(s, a, b, 25, ++intermediate); + } while (--x != 0); +} + +inline void Sum565(const SgrIntermediateBuffer* const intermediate, + uint16_t* const a, uint32_t* const b) { + *a = 5 * (intermediate[0].a + intermediate[2].a) + 6 * intermediate[1].a; + *b = 5 * (intermediate[0].b + intermediate[2].b) + 6 * intermediate[1].b; +} + +template <typename Pixel> +inline int CalculateFilteredOutput(const Pixel src, const uint32_t a, + const uint32_t b, const int shift) { + // v < 2^32. All intermediate calculations are positive. 
+ const uint32_t v = a * src + b; + return RightShiftWithRounding(v, + kSgrProjSgrBits + shift - kSgrProjRestoreBits); +} + +template <typename Pixel> +inline void BoxFilterPass1(const Pixel src0, const Pixel src1, + const SgrIntermediateBuffer* const intermediate[2], + const ptrdiff_t x, int p[2]) { + uint16_t a[2]; + uint32_t b[2]; + Sum565(intermediate[0] + x, &a[0], &b[0]); + Sum565(intermediate[1] + x, &a[1], &b[1]); + p[0] = CalculateFilteredOutput<Pixel>(src0, a[0] + a[1], b[0] + b[1], 5); + p[1] = CalculateFilteredOutput<Pixel>(src1, a[1], b[1], 4); +} + +template <typename Pixel> +inline int BoxFilterPass2(const Pixel src, + const SgrIntermediateBuffer* const intermediate[3], + const ptrdiff_t x) { + const uint32_t a = 3 * (intermediate[0][x + 0].a + intermediate[0][x + 2].a + + intermediate[2][x + 0].a + intermediate[2][x + 2].a) + + 4 * (intermediate[0][x + 1].a + intermediate[1][x + 0].a + + intermediate[1][x + 1].a + intermediate[1][x + 2].a + + intermediate[2][x + 1].a); + const uint32_t b = 3 * (intermediate[0][x + 0].b + intermediate[0][x + 2].b + + intermediate[2][x + 0].b + intermediate[2][x + 2].b) + + 4 * (intermediate[0][x + 1].b + intermediate[1][x + 0].b + + intermediate[1][x + 1].b + intermediate[1][x + 2].b + + intermediate[2][x + 1].b); + return CalculateFilteredOutput<Pixel>(src, a, b, 5); +} + +template <int bitdepth, typename Pixel> +inline Pixel SelfGuidedDoubleMultiplier(const int src, + const int box_filter_process_output0, + const int box_filter_process_output1, + const int16_t w0, const int16_t w1, + const int16_t w2) { + const int v = w1 * (src << kSgrProjRestoreBits) + + w0 * box_filter_process_output0 + + w2 * box_filter_process_output1; + // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: + // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. + // Then, range of s is bitdepth + 2. This is a rough estimation, taking + // the maximum value of each element. + const int s = + RightShiftWithRounding(v, kSgrProjRestoreBits + kSgrProjPrecisionBits); + return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1)); +} + +template <int bitdepth, typename Pixel> +inline Pixel SelfGuidedSingleMultiplier(const int src, + const int box_filter_process_output, + const int16_t w0, const int16_t w1) { + const int v = + w1 * (src << kSgrProjRestoreBits) + w0 * box_filter_process_output; + // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: + // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. + // Then, range of s is bitdepth + 2. This is a rough estimation, taking + // the maximum value of each element. + const int s = + RightShiftWithRounding(v, kSgrProjRestoreBits + kSgrProjPrecisionBits); + return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1)); } template <int bitdepth, typename Pixel> -void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( +inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( const RestorationUnitInfo& restoration_info, const Pixel* src, - ptrdiff_t stride, int width, int height, RestorationBuffer* const buffer) { + const ptrdiff_t src_stride, const int width, const int height, + SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) { const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t s0 = kSgrScaleParameter[sgr_proj_index][0]; // s0 < 2^12. + const uint32_t s1 = kSgrScaleParameter[sgr_proj_index][1]; // s1 < 2^12. 
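[Editor's aside, not part of the patch.] The BoxFilterPreProcessTop/BoxFilterPreProcess/BoxFilterPreProcessBottom helpers above replace the old integral images with box sums that are seeded once and then slid one column at a time: the column leaving the window is subtracted and the column entering it is added before each CalculateIntermediate() call. A minimal scalar sketch of that update pattern (ad hoc names and signature, not the libgav1 API):

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Produce, for every horizontal position, the sum and the sum of squares of
  // a size x size window, seeding the first window and then sliding it, as the
  // BoxFilterPreProcess() variants do above.
  void SlidingWindowSums(const uint16_t* src, ptrdiff_t stride, int size,
                         int width, std::vector<uint32_t>* sum,
                         std::vector<uint32_t>* square_sum) {
    uint32_t b = 0;  // sum of the current window
    uint32_t a = 0;  // sum of squares of the current window
    for (int dy = 0; dy < size; ++dy) {
      for (int dx = 0; dx < size; ++dx) {
        const uint32_t s = src[dy * stride + dx];
        a += s * s;
        b += s;
      }
    }
    sum->push_back(b);
    square_sum->push_back(a);
    for (int x = 1; x < width; ++x) {
      for (int dy = 0; dy < size; ++dy) {
        const uint32_t leaving = src[dy * stride + x - 1];
        const uint32_t entering = src[dy * stride + x - 1 + size];
        a += entering * entering - leaving * leaving;
        b += entering - leaving;
      }
      sum->push_back(b);
      square_sum->push_back(a);
    }
  }

Each step touches 2 * size samples instead of re-summing the full size * size window.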
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + SgrIntermediateBuffer *intermediate0[2], *intermediate1[4]; + assert(s0 != 0); + assert(s1 != 0); + intermediate0[0] = buffer->intermediate; + intermediate0[1] = intermediate0[0] + kIntermediateStride; + intermediate1[0] = intermediate0[1] + kIntermediateStride; + intermediate1[1] = intermediate1[0] + kIntermediateStride, + intermediate1[2] = intermediate1[1] + kIntermediateStride, + intermediate1[3] = intermediate1[2] + kIntermediateStride; + BoxFilterPreProcessTop<bitdepth, Pixel>(src - 2 * src_stride - 3, src_stride, + width + 2, s0, intermediate0[0]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2 * src_stride - 2, src_stride, + width + 2, s1, intermediate1[0]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 1 * src_stride - 2, src_stride, + width + 2, s1, intermediate1[1]); + for (int y = height >> 1; y != 0; --y) { + BoxFilterPreProcess<bitdepth, Pixel, 5>(src - src_stride - 3, src_stride, + width + 2, s0, intermediate0[1]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s1, + intermediate1[2]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src + src_stride - 2, src_stride, + width + 2, s1, intermediate1[3]); + int x = 0; + do { + int p[2][2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate0, x, + p[0]); + p[1][0] = BoxFilterPass2<Pixel>(src[x], intermediate1, x); + p[1][1] = + BoxFilterPass2<Pixel>(src[src_stride + x], intermediate1 + 1, x); + dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0], + p[1][0], w0, w1, w2); + dst[dst_stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>( + src[src_stride + x], p[0][1], p[1][1], w0, w1, w2); + } while (++x < width); + src += 2 * src_stride; + dst += 2 * dst_stride; + std::swap(intermediate0[0], intermediate0[1]); + std::swap(intermediate1[0], intermediate1[2]); + std::swap(intermediate1[1], intermediate1[3]); + } + if ((height & 1) != 0) { + BoxFilterPreProcessBottom<bitdepth, Pixel>(src - src_stride - 3, src_stride, + width + 2, s0, intermediate0[1]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s1, + intermediate1[2]); + int x = 0; + do { + int p[2][2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate0, x, + p[0]); + p[1][0] = BoxFilterPass2<Pixel>(src[x], intermediate1, x); + dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0], + p[1][0], w0, w1, w2); + } while (++x < width); + } +} - // We calculate intermediate values for the region (width + 1) x (height + 1). - // The region we can access is (width + 1 + radius) x (height + 1 + radius). - // The max radius is 2. width = height = - // kRestorationProcessingUnitSizeWithBorders. - // For the integral_image, we need one row before the accessible region, - // so the stride is kRestorationProcessingUnitSizeWithBorders + 1. - // We fix the first row and first column of integral image be 0 to facilitate - // computation. - - // Note that the max sum = (2 ^ bitdepth - 1) * - // kRestorationProcessingUnitSizeWithBorders * - // kRestorationProcessingUnitSizeWithBorders. - // The max sum is larger than 2^16. - // Case 8 bit and 10 bit: - // The final box sum has at most 25 pixels, which is within 16 bits. So - // keeping truncated 16-bit values is enough. - // Case 12 bit, radius 1: - // The final box sum has 9 pixels, which is within 16 bits. 
So keeping - // truncated 16-bit values is enough. - // Case 12 bit, radius 2: - // The final box sum has 25 pixels. It can be calculated by calculating the - // top 15 pixels and the bottom 10 pixels separately, and adding them - // together. So keeping truncated 16-bit values is enough. - // If it is slower than using 32-bit for specific CPU targets, please split - // into 2 paths. - uint16_t integral_image[(kRestorationProcessingUnitSizeWithBorders + 1) * - (kRestorationProcessingUnitSizeWithBorders + 1)]; - - // Note that the max squared sum = - // (2 ^ bitdepth - 1) * (2 ^ bitdepth - 1) * - // kRestorationProcessingUnitSizeWithBorders * - // kRestorationProcessingUnitSizeWithBorders. - // For 8 bit, 32-bit is enough. For 10 bit and up, the sum could be larger - // than 2^32. However, the final box sum has at most 25 squares, which is - // within 32 bits. So keeping truncated 32-bit values is enough. - uint32_t - square_integral_image[(kRestorationProcessingUnitSizeWithBorders + 1) * - (kRestorationProcessingUnitSizeWithBorders + 1)]; - const ptrdiff_t integral_image_stride = - kRestorationProcessingUnitSizeWithBorders + 1; - const ptrdiff_t filtered_output_stride = - buffer->box_filter_process_output_stride; - const ptrdiff_t intermediate_stride = - buffer->box_filter_process_intermediate_stride; - const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * intermediate_stride + kRestorationBorder; - - ComputeIntegralImage<Pixel>( - src - kRestorationBorder * stride - kRestorationBorder, stride, - width + 2 * kRestorationBorder, height + 2 * kRestorationBorder, - integral_image, square_integral_image, integral_image_stride); - - for (int pass = 0; pass < 2; ++pass) { - const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2]; - if (radius == 0) continue; - LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess( - restoration_info, integral_image, square_integral_image, width, height, - pass, buffer); - - const Pixel* src_ptr = src; - // Set intermediate buffer start point to the actual start point of - // filtering. - const uint32_t* array_start[2] = { - buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset, - buffer->box_filter_process_intermediate[1] + - intermediate_buffer_offset}; - int* filtered_output = buffer->box_filter_process_output[pass]; - for (int y = 0; y < height; ++y) { - const int shift = (pass == 0 && (y & 1) != 0) ? 4 : 5; - // array_start[0]: range [1, 256]. - // array_start[1] < 2^20. 
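[Editor's aside, not part of the patch.] The deleted notes just above (array_start[0] in [1, 256], array_start[1] < 2^20) are easier to read together with what the a2/b2 pair means. Since the comment in CalculateIntermediate() states one_over_n = round(2^12 / n) and kSgrProjReciprocalBits is 12, the stored b is roughly (2^8 - a2) times the window mean, so with the spatial weights summing to a power of two the filtered output is a blend of the pixel and its local mean (x-bar being the window mean after the bit-depth shift):

\[
  \frac{b_2}{2^{12}} \;\approx\; (2^8 - a_2)\,\bar{x},
  \qquad
  \text{CalculateFilteredOutput} \;\approx\;
  \Bigl(\tfrac{a_2}{2^8}\,\text{src} + \bigl(1 - \tfrac{a_2}{2^8}\bigr)\,\bar{x}\Bigr)
  \cdot 2^{\text{kSgrProjRestoreBits}} .
\]

If, as its name suggests, kXByXPlus1[z] is roughly 2^8 * z / (z + 1) (an assumption here, the table itself is not in this patch), then a2 / 2^8 approaches 1 where the window variance is large, so detailed areas keep the source pixel while flat areas are pulled toward their mean.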
- for (int x = 0; x < width; ++x) { - uint32_t a, b; - if (pass == 0) { - if ((y & 1) == 0) { - a = 5 * (array_start[0][-intermediate_stride + x - 1] + - array_start[0][-intermediate_stride + x + 1] + - array_start[0][intermediate_stride + x - 1] + - array_start[0][intermediate_stride + x + 1]) + - 6 * (array_start[0][-intermediate_stride + x] + - array_start[0][intermediate_stride + x]); - b = 5 * (array_start[1][-intermediate_stride + x - 1] + - array_start[1][-intermediate_stride + x + 1] + - array_start[1][intermediate_stride + x - 1] + - array_start[1][intermediate_stride + x + 1]) + - 6 * (array_start[1][-intermediate_stride + x] + - array_start[1][intermediate_stride + x]); - } else { - a = 5 * (array_start[0][x - 1] + array_start[0][x + 1]) + - 6 * array_start[0][x]; - b = 5 * (array_start[1][x - 1] + array_start[1][x + 1]) + - 6 * array_start[1][x]; - } - } else { - a = 3 * (array_start[0][-intermediate_stride + x - 1] + - array_start[0][-intermediate_stride + x + 1] + - array_start[0][intermediate_stride + x - 1] + - array_start[0][intermediate_stride + x + 1]) + - 4 * (array_start[0][-intermediate_stride + x] + - array_start[0][x - 1] + array_start[0][x] + - array_start[0][x + 1] + - array_start[0][intermediate_stride + x]); - b = 3 * (array_start[1][-intermediate_stride + x - 1] + - array_start[1][-intermediate_stride + x + 1] + - array_start[1][intermediate_stride + x - 1] + - array_start[1][intermediate_stride + x + 1]) + - 4 * (array_start[1][-intermediate_stride + x] + - array_start[1][x - 1] + array_start[1][x] + - array_start[1][x + 1] + - array_start[1][intermediate_stride + x]); - } - // v < 2^32. All intermediate calculations are positive. - const uint32_t v = a * src_ptr[x] + b; - filtered_output[x] = RightShiftWithRounding( - v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - } - src_ptr += stride; - array_start[0] += intermediate_stride; - array_start[1] += intermediate_stride; - filtered_output += filtered_output_stride; - } +template <int bitdepth, typename Pixel> +inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass1( + const RestorationUnitInfo& restoration_info, const Pixel* src, + const ptrdiff_t src_stride, const int width, const int height, + SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) { + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t s = kSgrScaleParameter[sgr_proj_index][0]; // s < 2^12. 
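[Editor's aside, not part of the patch.] The shift argument passed to CalculateFilteredOutput() encodes the total spatial weight applied to the intermediates, the same 4-versus-5 split the deleted loop above selected with (pass == 0 && (y & 1) != 0) ? 4 : 5. A compile-time restatement of that bookkeeping:

  // Sum565() weights one row of intermediates 5-6-5; BoxFilterPass1() adds two
  // such rows for one output row of each pair (total 32) and uses a single row
  // for the other (total 16). BoxFilterPass2() weights the four corners by 3
  // and the five cross positions by 4 (total 32). Every total is a power of
  // two, so the normalization folds into the final right shift.
  constexpr int kSum565OneRow = 5 + 6 + 5;
  constexpr int kSum565TwoRows = 2 * kSum565OneRow;
  constexpr int kSum343Cross = 4 * 3 + 5 * 4;
  static_assert(kSum565OneRow == 1 << 4, "pass 1, single-row output: shift 4");
  static_assert(kSum565TwoRows == 1 << 5, "pass 1, two-row output: shift 5");
  static_assert(kSum343Cross == 1 << 5, "pass 2: shift 5");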
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0; + SgrIntermediateBuffer* intermediate[2]; + assert(s != 0); + intermediate[0] = buffer->intermediate; + intermediate[1] = intermediate[0] + kIntermediateStride; + BoxFilterPreProcessTop<bitdepth, Pixel>(src - 2 * src_stride - 3, src_stride, + width + 2, s, intermediate[0]); + for (int y = height >> 1; y != 0; --y) { + BoxFilterPreProcess<bitdepth, Pixel, 5>(src - src_stride - 3, src_stride, + width + 2, s, intermediate[1]); + int x = 0; + do { + int p[2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate, x, p); + dst[x] = + SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0, w1); + dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>( + src[src_stride + x], p[1], w0, w1); + } while (++x < width); + src += 2 * src_stride; + dst += 2 * dst_stride; + std::swap(intermediate[0], intermediate[1]); + } + if ((height & 1) != 0) { + BoxFilterPreProcessBottom<bitdepth, Pixel>(src - src_stride - 3, src_stride, + width + 2, s, intermediate[1]); + int x = 0; + do { + int p[2]; + BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate, x, p); + dst[x] = + SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0, w1); + dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>( + src[src_stride + x], p[1], w0, w1); + } while (++x < width); } } -// Assume box_filter_process_output[2] are allocated before calling -// this function. Their sizes are width * height, stride equals width. +template <int bitdepth, typename Pixel> +inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass2( + const RestorationUnitInfo& restoration_info, const Pixel* src, + const ptrdiff_t src_stride, const int width, const int height, + SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t s = kSgrScaleParameter[sgr_proj_index][1]; // s < 2^12. 
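[Editor's aside, not part of the patch.] BoxFilterProcessPass1() above feeds its output into SelfGuidedSingleMultiplier(), whose weights are constructed to sum to 1 << kSgrProjPrecisionBits; one consequence worth spelling out is that a pass returning exactly src << kSgrProjRestoreBits leaves the pixel unchanged. A small standalone check of that property (kSgrProjRestoreBits = 4 comes from the loop_restoration.h hunk below; kSgrProjPrecisionBits = 7 is assumed here, and the bit-depth clipping is omitted):

  #include <cassert>
  #include <cstdint>

  constexpr int kRestoreBits = 4;    // kSgrProjRestoreBits in this patch
  constexpr int kPrecisionBits = 7;  // assumed value of kSgrProjPrecisionBits

  int SelfGuidedSingle(int src, int box_filter_output, int16_t w0) {
    const int16_t w1 = (1 << kPrecisionBits) - w0;
    const int v = w1 * (src << kRestoreBits) + w0 * box_filter_output;
    const int shift = kRestoreBits + kPrecisionBits;
    return (v + (1 << (shift - 1))) >> shift;  // RightShiftWithRounding, v >= 0
  }

  int main() {
    for (int src = 0; src < 256; ++src) {
      // When the box filter reproduces the (scaled) source, the projection is
      // the identity regardless of w0, because w0 + w1 == 1 << kPrecisionBits.
      assert(SelfGuidedSingle(src, src << kRestoreBits, /*w0=*/47) == src);
    }
    return 0;
  }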
+ SgrIntermediateBuffer* intermediate[3]; + assert(s != 0); + intermediate[0] = buffer->intermediate; + intermediate[1] = intermediate[0] + kIntermediateStride; + intermediate[2] = intermediate[1] + kIntermediateStride; + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2 * src_stride - 2, src_stride, + width + 2, s, intermediate[0]); + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 1 * src_stride - 2, src_stride, + width + 2, s, intermediate[1]); + int y = height; + do { + BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s, + intermediate[2]); + int x = 0; + do { + const int p = BoxFilterPass2<Pixel>(src[x], intermediate, x); + dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0, w1); + } while (++x < width); + src += src_stride; + dst += dst_stride; + SgrIntermediateBuffer* const intermediate0 = intermediate[0]; + intermediate[0] = intermediate[1]; + intermediate[1] = intermediate[2]; + intermediate[2] = intermediate0; + } while (--y != 0); +} + template <int bitdepth, typename Pixel> void LoopRestorationFuncs_C<bitdepth, Pixel>::SelfGuidedFilter( const void* const source, void* const dest, const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride, ptrdiff_t dest_stride, int width, int height, - RestorationBuffer* const buffer) { - const int w0 = restoration_info.sgr_proj_info.multiplier[0]; - const int w1 = restoration_info.sgr_proj_info.multiplier[1]; - const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + RestorationBuffer* const /*buffer*/) { const int index = restoration_info.sgr_proj_info.index; - const int radius_pass_0 = kSgrProjParams[index][0]; - const int radius_pass_1 = kSgrProjParams[index][2]; - const ptrdiff_t array_stride = buffer->box_filter_process_output_stride; - const int* box_filter_process_output[2] = { - buffer->box_filter_process_output[0], - buffer->box_filter_process_output[1]}; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 const auto* src = static_cast<const Pixel*>(source); auto* dst = static_cast<Pixel*>(dest); - source_stride /= sizeof(Pixel); - dest_stride /= sizeof(Pixel); - LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( - restoration_info, src, source_stride, width, height, buffer); - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - const int u = src[x] << kSgrProjRestoreBits; - int v = w1 * u; - if (radius_pass_0 != 0) { - v += w0 * box_filter_process_output[0][x]; - } else { - v += w0 * u; - } - if (radius_pass_1 != 0) { - v += w2 * box_filter_process_output[1][x]; - } else { - v += w2 * u; - } - // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: - // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. - // Then, range of s is bitdepth + 2. This is a rough estimation, taking - // the maximum value of each element. - const int s = RightShiftWithRounding( - v, kSgrProjRestoreBits + kSgrProjPrecisionBits); - dst[x] = static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1)); - } - src += source_stride; - dst += dest_stride; - box_filter_process_output[0] += array_stride; - box_filter_process_output[1] += array_stride; + SgrBuffer buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. 
+ assert(radius_pass_0 != 0); + LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass1( + restoration_info, src, source_stride, width, height, &buffer, dst, + dest_stride); + } else if (radius_pass_0 == 0) { + LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass2( + restoration_info, src, source_stride, width, height, &buffer, dst, + dest_stride); + } else { + LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess( + restoration_info, src, source_stride, width, height, &buffer, dst, + dest_stride); } } @@ -739,7 +772,7 @@ void LoopRestorationInit_C() { // available. static_cast<void>(CountZeroCoefficients); static_cast<void>(PopulateWienerCoefficients); - static_cast<void>(WienerVertical); + static_cast<void>(Sum565); } } // namespace dsp diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h index 663639c682f..d5511eab24f 100644 --- a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h +++ b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h @@ -38,6 +38,19 @@ namespace libgav1 { namespace dsp { +enum { + // Precision of a division table (mtable) + kSgrProjScaleBits = 20, + kSgrProjReciprocalBits = 12, + // Core self-guided restoration precision bits. + kSgrProjSgrBits = 8, + // Precision bits of generated values higher than source before projection. + kSgrProjRestoreBits = 4 +}; // anonymous enum + +extern const int kXByXPlus1[256]; +extern const uint8_t kSgrMa2Lookup[256]; + // Initializes Dsp::loop_restorations. This function is not thread-safe. void LoopRestorationInit_C(); diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc index 59cfeb4db72..b51ec8f7270 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc +++ b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc @@ -22,6 +22,7 @@ #include "src/dsp/dsp.h" #include "src/utils/common.h" #include "src/utils/constants.h" +#include "src/utils/reference_info.h" #include "src/utils/types.h" namespace libgav1 { @@ -36,12 +37,11 @@ namespace { !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel)) // 7.9.2. 
-void MotionFieldProjectionKernel_C( - const ReferenceFrameType* source_reference_type, const MotionVector* mv, - const uint8_t order_hint[kNumReferenceFrameTypes], - unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* motion_field) { +void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info, + int reference_to_current_with_sign, + int dst_sign, int y8_start, int y8_end, + int x8_start, int x8_end, + TemporalMotionField* motion_field) { const ptrdiff_t stride = motion_field->mv.columns(); // The column range has to be offset by kProjectionMvMaxHorizontalOffset since // coordinates in that range could end up being position_x8 because of @@ -50,37 +50,31 @@ void MotionFieldProjectionKernel_C( std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0); const int adjusted_x8_end = std::min( x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride)); + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; MotionVector* dst_mv = motion_field->mv[y8_start]; - int reference_offsets[kNumReferenceFrameTypes]; - bool skip_reference[kNumReferenceFrameTypes]; assert(stride == motion_field->reference_offset.columns()); assert((y8_start & 7) == 0); - // Initialize skip_reference[kReferenceFrameIntra] to simplify branch - // conditions in projection. - skip_reference[kReferenceFrameIntra] = true; - for (int reference_type = kReferenceFrameLast; - reference_type <= kNumInterReferenceFrameTypes; ++reference_type) { - const int reference_offset = - GetRelativeDistance(current_frame_order_hint, - order_hint[reference_type], order_hint_shift_bits); - skip_reference[reference_type] = - reference_offset > kMaxFrameDistance || reference_offset <= 0; - reference_offsets[reference_type] = reference_offset; - } - int y8 = y8_start; do { const int y8_floor = (y8 & ~7) - y8; const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); int x8 = adjusted_x8_start; do { - if (skip_reference[source_reference_type[x8]]) continue; - const int reference_offset = reference_offsets[source_reference_type[x8]]; + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; MotionVector projection_mv; // reference_to_current_with_sign could be 0. - GetMvProjection(mv[x8], reference_to_current_with_sign, reference_offset, + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], &projection_mv); // Do not update the motion vector if the block position is not valid or // if position_x8 is outside the current range of x8_start and x8_end. 
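[Editor's aside, not part of the patch.] With ReferenceInfo, the kernel above no longer derives reference offsets on the fly; in particular GetMvProjection() now receives projection_divisions[], a precomputed per-reference reciprocal of the frame distance, so the per-block projection becomes a multiply and shift rather than a division. A scalar sketch of that idea (the 1 << 14 scale mirrors the AV1 reference division table and is an assumption of this sketch, as is the rounding; any clamping the decoder applies is omitted):

  #include <cstdint>

  constexpr int kScaleBits = 14;  // assumed precision of the division table

  // division ~= round((1 << kScaleBits) / denominator), built once per
  // reference frame (the role projection_divisions plays above, under this
  // sketch's assumptions). Requires denominator >= 1.
  int16_t MakeDivision(int denominator) {
    return static_cast<int16_t>(((1 << kScaleBits) + (denominator >> 1)) /
                                denominator);
  }

  // mv * numerator / denominator ~= (mv * numerator * division) >> kScaleBits.
  int16_t ProjectComponent(int16_t mv, int numerator, int16_t division) {
    const int64_t value = static_cast<int64_t>(mv) * numerator * division;
    const int64_t offset = int64_t{1} << (kScaleBits - 1);
    const int64_t rounded = (value >= 0) ? (value + offset) >> kScaleBits
                                         : -((-value + offset) >> kScaleBits);
    return static_cast<int16_t>(rounded);
  }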
@@ -97,9 +91,9 @@ void MotionFieldProjectionKernel_C( if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; dst_mv[position_y8 * stride + position_x8] = mv[x8]; dst_reference_offset[position_y8 * stride + position_x8] = - reference_offset; + reference_offsets[source_reference_type]; } while (++x8 < adjusted_x8_end); - source_reference_type += stride; + source_reference_types += stride; mv += stride; dst_reference_offset += stride; dst_mv += stride; diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h index 5b18be5a3ac..36de459d8f3 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h +++ b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h @@ -24,6 +24,14 @@ // ARM: #include "src/dsp/arm/motion_field_projection_neon.h" +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +// SSE4_1 +#include "src/dsp/x86/motion_field_projection_sse4.h" +// clang-format on // IWYU pragma: end_exports diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc index 33ecb2b1818..94023027fd9 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc +++ b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc @@ -47,9 +47,10 @@ void MvProjectionCompoundLowPrecision_C( for (int i = 0; i < 2; ++i) { // |offsets| non-zero check usually equals true and could be ignored. if (offsets[i] != 0) { - GetMvProjection(temporal_mvs[index], offsets[i], - temporal_reference_offsets[index], - &candidate_mvs[index].mv[i]); + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); for (auto& mv : candidate_mvs[index].mv[i].mv) { // The next line is equivalent to: // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1; @@ -73,9 +74,10 @@ void MvProjectionCompoundForceInteger_C( for (int i = 0; i < 2; ++i) { // |offsets| non-zero check usually equals true and could be ignored. if (offsets[i] != 0) { - GetMvProjection(temporal_mvs[index], offsets[i], - temporal_reference_offsets[index], - &candidate_mvs[index].mv[i]); + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); for (auto& mv : candidate_mvs[index].mv[i].mv) { // The next line is equivalent to: // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7; @@ -101,9 +103,10 @@ void MvProjectionCompoundHighPrecision_C( for (int i = 0; i < 2; ++i) { // |offsets| non-zero check usually equals true and could be ignored. 
if (offsets[i] != 0) { - GetMvProjection(temporal_mvs[index], offsets[i], - temporal_reference_offsets[index], - &candidate_mvs[index].mv[i]); + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); } } } while (++index < count); @@ -115,8 +118,10 @@ void MvProjectionSingleLowPrecision_C( const int count, MotionVector* const candidate_mvs) { int index = 0; do { - GetMvProjection(temporal_mvs[index], reference_offset, - temporal_reference_offsets[index], &candidate_mvs[index]); + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); for (auto& mv : candidate_mvs[index].mv) { // The next line is equivalent to: // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1; @@ -131,8 +136,10 @@ void MvProjectionSingleForceInteger_C( const int count, MotionVector* const candidate_mvs) { int index = 0; do { - GetMvProjection(temporal_mvs[index], reference_offset, - temporal_reference_offsets[index], &candidate_mvs[index]); + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); for (auto& mv : candidate_mvs[index].mv) { // The next line is equivalent to: // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7; @@ -149,8 +156,10 @@ void MvProjectionSingleHighPrecision_C( const int count, MotionVector* const candidate_mvs) { int index = 0; do { - GetMvProjection(temporal_mvs[index], reference_offset, - temporal_reference_offsets[index], &candidate_mvs[index]); + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); } while (++index < count); } diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h index 7ab99a3f2f9..ae16726a961 100644 --- a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h +++ b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h @@ -25,6 +25,15 @@ // ARM: #include "src/dsp/arm/motion_vector_search_neon.h" +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +// SSE4_1 +#include "src/dsp/x86/motion_vector_search_sse4.h" +// clang-format on + // IWYU pragma: end_exports namespace libgav1 { diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc index eed99e5a9c6..fd2c54af4f2 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc @@ -38,16 +38,7 @@ namespace dsp { namespace low_bitdepth { namespace { -// CdefDirection: -// Mirror values and pad to 16 elements. -alignas(16) constexpr uint32_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, - 120, 105, 120, 140, 168, 210, - 280, 420, 840, 0}; - -// Used when calculating odd |cost[x]| values to mask off unwanted elements. -// Holds elements 1 3 5 X 5 3 1 X -alignas(16) constexpr uint32_t kDivisionTableOdd[] = {420, 210, 140, 0, - 140, 210, 420, 0}; +#include "src/dsp/cdef.inc" // Used to calculate |partial[0][i + j]| and |partial[4][7 + i - j]|. The input // is |src[j]| and it is being added to |partial[]| based on the above indices. 
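[Editor's aside, not part of the patch.] Back in the motion_vector_search.cc hunks above, the retained comments spell out the two post-projection rounding rules: low precision forces each component to an even value by stepping odd values toward zero, and force-integer snaps the magnitude to a multiple of 8 (full pel in eighth-pel units). The straightforward forms quoted in those comments, written out and spot-checked (the library's actual branch-free lines are not shown in this diff, so they are not reproduced):

  #include <cassert>
  #include <cstdlib>

  // "if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;"  -- low precision.
  int RoundToEven(int mv) {
    if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
    return mv;
  }

  // "(std::abs(mv) + 3) & ~7"  -- force integer; the sign restoration here is
  // illustrative only.
  int ForceInteger(int mv) {
    const int value = (std::abs(mv) + 3) & ~7;
    return (mv < 0) ? -value : value;
  }

  int main() {
    assert(RoundToEven(7) == 6 && RoundToEven(-7) == -6 && RoundToEven(4) == 4);
    assert(ForceInteger(13) == 16 && ForceInteger(-5) == -8 &&
           ForceInteger(4) == 0);
    return 0;
  }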
@@ -160,10 +151,10 @@ inline __m128i Square_S32(__m128i a) { return _mm_mullo_epi32(a, a); } // |cost[0]| and |cost[4]| square the input and sum with the corresponding // element from the other end of the vector: -// |kDivisionTable[]| element: +// |kCdefDivisionTable[]| element: // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * -// kDivisionTable[i + 1]; -// cost[0] += Square(partial[0][7]) * kDivisionTable[8]; +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8]; // Because everything is being summed into a single value the distributive // property allows us to mirror the division table and accumulate once. inline uint32_t Cost0Or4(const __m128i a, const __m128i b, @@ -185,16 +176,16 @@ inline uint32_t CostOdd(const __m128i a, const __m128i b, const __m128i a_hi_square = Square_S32(_mm_cvtepi16_epi32(_mm_srli_si128(a, 8))); // Swap element 0 and element 2. This pairs partial[i][10 - j] with - // kDivisionTable[2*j+1]. + // kCdefDivisionTable[2*j+1]. const __m128i b_lo_square = _mm_shuffle_epi32(Square_S32(_mm_cvtepi16_epi32(b)), 0x06); // First terms are indices 3-7. __m128i c = _mm_srli_si128(a_lo_square, 12); c = _mm_add_epi32(c, a_hi_square); - c = _mm_mullo_epi32(c, _mm_set1_epi32(kDivisionTable[7])); + c = _mm_mullo_epi32(c, _mm_set1_epi32(kCdefDivisionTable[7])); // cost[i] += (Square(base_partial[i][j]) + Square(base_partial[i][10 - j])) * - // kDivisionTable[2 * j + 1]; + // kCdefDivisionTable[2 * j + 1]; const __m128i second_cost = _mm_add_epi32(a_lo_square, b_lo_square); c = _mm_add_epi32(c, _mm_mullo_epi32(second_cost, division_table)); return SumVector_S32(c); @@ -241,18 +232,18 @@ void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride, const __m128i signed_offset = _mm_set1_epi16(128 * 8); partial_lo[2] = _mm_sub_epi16(partial_lo[2], signed_offset); - cost[2] = kDivisionTable[7] * SquareSum_S16(partial_lo[2]); - cost[6] = kDivisionTable[7] * SquareSum_S16(partial_lo[6]); + cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]); + cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]); - const __m128i division_table[4] = {LoadUnaligned16(kDivisionTable), - LoadUnaligned16(kDivisionTable + 4), - LoadUnaligned16(kDivisionTable + 8), - LoadUnaligned16(kDivisionTable + 12)}; + const __m128i division_table[4] = {LoadUnaligned16(kCdefDivisionTable), + LoadUnaligned16(kCdefDivisionTable + 4), + LoadUnaligned16(kCdefDivisionTable + 8), + LoadUnaligned16(kCdefDivisionTable + 12)}; cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table); cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table); - const __m128i division_table_odd = LoadAligned16(kDivisionTableOdd); + const __m128i division_table_odd = LoadAligned16(kCdefDivisionTableOdd); cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd); cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd); @@ -315,24 +306,6 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, src + y_1 * stride + stride + x_1); } -// Load 4 vectors based on the given |direction|. Use when |block_width| == 2 to -// do 2 rows at a time. 
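[Editor's aside, not part of the patch.] The kDivisionTable that used to live here (and the kCdefDivisionTable now pulled in from cdef.inc) holds 840 / n for n = 1..8 and then mirrors those values, so the direction costs in Cost0Or4() and CostOdd() above can divide each diagonal's squared partial sum by its length (scaled by 840) using integer multiplies only. A compile-time restatement of that relationship, with the values copied from the deleted table above:

  // First half of the mirrored table, as it appears in the deleted copy above.
  constexpr unsigned kCdefDivisionTableHead[8] = {840, 420, 280, 210,
                                                  168, 140, 120, 105};

  constexpr bool TableIs840OverN() {
    for (int n = 1; n <= 8; ++n) {
      if (kCdefDivisionTableHead[n - 1] != 840u / n) return false;
    }
    return true;
  }
  static_assert(TableIs840OverN(), "entry n - 1 is 840 / n");
  // 840 = 2^3 * 3 * 5 * 7, so every quotient above is exact.
  static_assert(840 % 5 == 0 && 840 % 6 == 0 && 840 % 7 == 0 && 840 % 8 == 0,
                "840 is divisible by 1 through 8");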
-void LoadDirection2(const uint16_t* const src, const ptrdiff_t stride, - __m128i* output, const int direction) { - const int y_0 = kCdefDirections[direction][0][0]; - const int x_0 = kCdefDirections[direction][0][1]; - const int y_1 = kCdefDirections[direction][1][0]; - const int x_1 = kCdefDirections[direction][1][1]; - output[0] = - Load4x2(src - y_0 * stride - x_0, src - y_0 * stride - x_0 + stride); - output[1] = - Load4x2(src + y_0 * stride + x_0, src - y_0 * stride - x_0 + stride); - output[2] = - Load4x2(src - y_1 * stride - x_1, src - y_0 * stride - x_0 + stride); - output[3] = - Load4x2(src + y_1 * stride + x_1, src - y_0 * stride - x_0 + stride); -} - inline __m128i Constrain(const __m128i& pixel, const __m128i& reference, const __m128i& damping, const __m128i& threshold) { const __m128i diff = _mm_sub_epi16(pixel, reference); @@ -340,6 +313,11 @@ inline __m128i Constrain(const __m128i& pixel, const __m128i& reference, // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping), // 0, std::abs(diff)) const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping); + // For bitdepth == 8, the threshold range is [0, 15] and the damping range is + // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be + // larger than threshold. Subtract using saturation will return 0 when pixel + // == kCdefLargeValue. + static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue"); const __m128i thresh_minus_shifted_diff = _mm_subs_epu16(threshold, shifted_diff); const __m128i clamp_abs_diff = @@ -349,34 +327,35 @@ inline __m128i Constrain(const __m128i& pixel, const __m128i& reference, } inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val, - const __m128i& mask, const __m128i& tap, - const __m128i& damping, + const __m128i& tap, const __m128i& damping, const __m128i& threshold) { const __m128i constrained = Constrain(val, pixel, damping, threshold); - return _mm_mullo_epi16(_mm_and_si128(constrained, mask), tap); + return _mm_mullo_epi16(constrained, tap); } -template <int width> +template <int width, bool enable_primary = true, bool enable_secondary = true> void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, const int direction, const int primary_strength, const int secondary_strength, const int damping, uint8_t* dst, const ptrdiff_t dst_stride) { - static_assert(width == 8 || width == 4 || width == 2, "Invalid CDEF width."); - + static_assert(width == 8 || width == 4, "Invalid CDEF width."); + static_assert(enable_primary || enable_secondary, ""); __m128i primary_damping_shift, secondary_damping_shift; + // FloorLog2() requires input to be > 0. - if (primary_strength == 0) { - primary_damping_shift = _mm_setzero_si128(); - } else { + // 8-bit damping range: Y: [3, 6], UV: [2, 5]. + if (enable_primary) { + // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary + // for UV filtering. primary_damping_shift = _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength))); } - - if (secondary_strength == 0) { - secondary_damping_shift = _mm_setzero_si128(); - } else { + if (enable_secondary) { + // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is + // necessary. 
+ assert(damping - FloorLog2(secondary_strength) >= 0); secondary_damping_shift = - _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(secondary_strength))); + _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength)); } const __m128i primary_tap_0 = @@ -385,8 +364,6 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]); const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0); const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1); - const __m128i cdef_large_value = - _mm_set1_epi16(static_cast<int16_t>(kCdefLargeValue)); const __m128i cdef_large_value_mask = _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue)); const __m128i primary_threshold = _mm_set1_epi16(primary_strength); @@ -397,126 +374,113 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, __m128i pixel; if (width == 8) { pixel = LoadUnaligned16(src); - } else if (width == 4) { - pixel = LoadHi8(LoadLo8(src), src + src_stride); - } else { - pixel = Load4x2(src, src + src_stride); - } - - // Primary |direction|. - __m128i primary_val[4]; - if (width == 8) { - LoadDirection(src, src_stride, primary_val, direction); - } else if (width == 4) { - LoadDirection4(src, src_stride, primary_val, direction); } else { - LoadDirection2(src, src_stride, primary_val, direction); + pixel = LoadHi8(LoadLo8(src), src + src_stride); } __m128i min = pixel; - min = _mm_min_epu16(min, primary_val[0]); - min = _mm_min_epu16(min, primary_val[1]); - min = _mm_min_epu16(min, primary_val[2]); - min = _mm_min_epu16(min, primary_val[3]); - __m128i max = pixel; - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[0], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[1], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[2], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(primary_val[3], cdef_large_value_mask)); - __m128i mask = _mm_cmplt_epi16(primary_val[0], cdef_large_value); - __m128i sum = - ApplyConstrainAndTap(pixel, primary_val[0], mask, primary_tap_0, - primary_damping_shift, primary_threshold); - mask = _mm_cmplt_epi16(primary_val[1], cdef_large_value); - sum = _mm_add_epi16( - sum, ApplyConstrainAndTap(pixel, primary_val[1], mask, primary_tap_0, - primary_damping_shift, primary_threshold)); - mask = _mm_cmplt_epi16(primary_val[2], cdef_large_value); - sum = _mm_add_epi16( - sum, ApplyConstrainAndTap(pixel, primary_val[2], mask, primary_tap_1, - primary_damping_shift, primary_threshold)); - mask = _mm_cmplt_epi16(primary_val[3], cdef_large_value); - sum = _mm_add_epi16( - sum, ApplyConstrainAndTap(pixel, primary_val[3], mask, primary_tap_1, - primary_damping_shift, primary_threshold)); - - // Secondary |direction| values (+/- 2). Clamp |direction|. - __m128i secondary_val[8]; - if (width == 8) { - LoadDirection(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); - } else if (width == 4) { - LoadDirection4(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection4(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); + __m128i sum; + + if (enable_primary) { + // Primary |direction|. 
+ __m128i primary_val[4]; + if (width == 8) { + LoadDirection(src, src_stride, primary_val, direction); + } else { + LoadDirection4(src, src_stride, primary_val, direction); + } + + min = _mm_min_epu16(min, primary_val[0]); + min = _mm_min_epu16(min, primary_val[1]); + min = _mm_min_epu16(min, primary_val[2]); + min = _mm_min_epu16(min, primary_val[3]); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]); + const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]); + const __m128i max_p = _mm_max_epu8(max_p01, max_p23); + max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask)); + + sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0, + primary_damping_shift, primary_threshold); + sum = _mm_add_epi16( + sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0, + primary_damping_shift, primary_threshold)); + sum = _mm_add_epi16( + sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1, + primary_damping_shift, primary_threshold)); + sum = _mm_add_epi16( + sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1, + primary_damping_shift, primary_threshold)); } else { - LoadDirection2(src, src_stride, secondary_val, (direction + 2) & 0x7); - LoadDirection2(src, src_stride, secondary_val + 4, (direction - 2) & 0x7); + sum = _mm_setzero_si128(); } - min = _mm_min_epu16(min, secondary_val[0]); - min = _mm_min_epu16(min, secondary_val[1]); - min = _mm_min_epu16(min, secondary_val[2]); - min = _mm_min_epu16(min, secondary_val[3]); - min = _mm_min_epu16(min, secondary_val[4]); - min = _mm_min_epu16(min, secondary_val[5]); - min = _mm_min_epu16(min, secondary_val[6]); - min = _mm_min_epu16(min, secondary_val[7]); - - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[0], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[1], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[2], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[3], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[4], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[5], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[6], cdef_large_value_mask)); - max = _mm_max_epu16(max, - _mm_and_si128(secondary_val[7], cdef_large_value_mask)); - - mask = _mm_cmplt_epi16(secondary_val[0], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[0], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[1], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[1], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[2], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[2], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[3], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[3], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[4], 
cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[4], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[5], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[5], mask, secondary_tap_0, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[6], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[6], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - mask = _mm_cmplt_epi16(secondary_val[7], cdef_large_value); - sum = _mm_add_epi16(sum, ApplyConstrainAndTap( - pixel, secondary_val[7], mask, secondary_tap_1, - secondary_damping_shift, secondary_threshold)); - + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. + __m128i secondary_val[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val, direction + 2); + LoadDirection(src, src_stride, secondary_val + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val, direction + 2); + LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); + } + + min = _mm_min_epu16(min, secondary_val[0]); + min = _mm_min_epu16(min, secondary_val[1]); + min = _mm_min_epu16(min, secondary_val[2]); + min = _mm_min_epu16(min, secondary_val[3]); + min = _mm_min_epu16(min, secondary_val[4]); + min = _mm_min_epu16(min, secondary_val[5]); + min = _mm_min_epu16(min, secondary_val[6]); + min = _mm_min_epu16(min, secondary_val[7]); + + const __m128i max_s01 = _mm_max_epu8(secondary_val[0], secondary_val[1]); + const __m128i max_s23 = _mm_max_epu8(secondary_val[2], secondary_val[3]); + const __m128i max_s45 = _mm_max_epu8(secondary_val[4], secondary_val[5]); + const __m128i max_s67 = _mm_max_epu8(secondary_val[6], secondary_val[7]); + const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23), + _mm_max_epu8(max_s45, max_s67)); + max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask)); + + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + } // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15); // 8 + sum @@ -536,20 +500,13 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, StoreLo8(dst, result); dst += dst_stride; ++y; - } else if (width == 4) { + } else { src += 2 * 
src_stride; Store4(dst, result); dst += dst_stride; Store4(dst, _mm_srli_si128(result, 4)); dst += dst_stride; y += 2; - } else { - src += 2 * src_stride; - Store2(dst, result); - dst += dst_stride; - Store2(dst, _mm_srli_si128(result, 2)); - dst += dst_stride; - y += 2; } } while (y < height); } @@ -558,29 +515,46 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height, // inside the frame. However it requires the source input to be padded with a // constant large value if at the boundary. The input must be uint16_t. void CdefFilter_SSE4_1(const void* const source, const ptrdiff_t source_stride, - const int rows4x4, const int columns4x4, - const int curr_x, const int curr_y, - const int subsampling_x, const int subsampling_y, + const int block_width, const int block_height, const int primary_strength, const int secondary_strength, const int damping, const int direction, void* const dest, const ptrdiff_t dest_stride) { - const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x; - const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y; - const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x); - const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y); const auto* src = static_cast<const uint16_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - if (block_width == 8) { - DoCdef<8>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); - } else if (block_width == 4) { - DoCdef<4>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + if (secondary_strength > 0) { + if (primary_strength > 0) { + if (block_width == 8) { + DoCdef<8>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4>(src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } else { + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } + } } else { - assert(block_width == 2); - DoCdef<2>(src, source_stride, block_height, direction, primary_strength, - secondary_strength, damping, dst, dest_stride); + if (block_width == 8) { + DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } else { + assert(block_width == 4); + DoCdef<4, /*enable_primary=*/true, /*enable_secondary=*/false>( + src, source_stride, block_height, direction, primary_strength, + secondary_strength, damping, dst, dest_stride); + } } } diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h index 8b03db69f7a..24c801fd863 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h @@ -17,6 +17,7 @@ #ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_ #define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_ +#include "src/utils/compiler_attributes.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_SSE4_1 @@ -91,6 +92,14 @@ inline 
__m128i Load2x2(const void* src1, const void* src2) { return _mm_cvtsi32_si128(val1 | (val2 << 16)); } +// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1. +template <int lane> +inline __m128i Load2(const void* const buf, __m128i val) { + uint16_t temp; + memcpy(&temp, buf, 2); + return _mm_insert_epi16(val, temp, lane); +} + inline __m128i Load4(const void* src) { // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a @@ -136,6 +145,41 @@ inline __m128i LoadAligned16(const void* a) { } //------------------------------------------------------------------------------ +// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. + +inline __m128i MaskOverreads(const __m128i source, + const int over_read_in_bytes) { + __m128i dst = source; +#if LIBGAV1_MSAN + if (over_read_in_bytes > 0) { + __m128i mask = _mm_set1_epi8(-1); + for (int i = 0; i < over_read_in_bytes; ++i) { + mask = _mm_srli_si128(mask, 1); + } + dst = _mm_and_si128(dst, mask); + } +#else + static_cast<void>(over_read_in_bytes); +#endif + return dst; +} + +inline __m128i LoadLo8Msan(const void* const source, + const int over_read_in_bytes) { + return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8); +} + +inline __m128i LoadAligned16Msan(const void* const source, + const int over_read_in_bytes) { + return MaskOverreads(LoadAligned16(source), over_read_in_bytes); +} + +inline __m128i LoadUnaligned16Msan(const void* const source, + const int over_read_in_bytes) { + return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes); +} + +//------------------------------------------------------------------------------ // Store functions. inline void Store2(void* dst, const __m128i x) { @@ -156,6 +200,10 @@ inline void StoreHi8(void* a, const __m128i v) { _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v)); } +inline void StoreAligned16(void* a, const __m128i v) { + _mm_store_si128(static_cast<__m128i*>(a), v); +} + inline void StoreUnaligned16(void* a, const __m128i v) { _mm_storeu_si128(static_cast<__m128i*>(a), v); } diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc index 40ce568a491..a0ed3bea758 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "src/dsp/convolve.h" +#include "src/utils/constants.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_SSE4_1 @@ -33,8 +34,40 @@ namespace dsp { namespace low_bitdepth { namespace { +// TODO(slavarnway): Move to common neon/sse4 file. +int GetNumTapsInFilter(const int filter_index) { + if (filter_index < 2) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + return 6; + } + + if (filter_index == 2) { + // kInterpolationFilterEightTapSharp + return 8; + } + + if (filter_index == 3) { + // kInterpolationFilterBilinear + return 2; + } + + assert(filter_index > 3); + // For small sizes (width/height <= 4) the large filters are replaced with 4 + // tap options. + // If the original filters were |kInterpolationFilterEightTap| or + // |kInterpolationFilterEightTapSharp| then it becomes + // |kInterpolationFilterSwitchable|. + // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 + // tap filter. 
+ return 4; +} + +constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels; constexpr int kSubPixelMask = (1 << kSubPixelBits) - 1; constexpr int kHorizontalOffset = 3; +constexpr int kFilterIndexShift = 6; // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final @@ -177,6 +210,15 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, return _mm_packus_epi16(sum, sum); } +template <int filter_index> +__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + const __m128i sum = + SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + template <int num_taps, int step, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, @@ -195,7 +237,11 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, if (is_2d || is_compound) { const __m128i v_sum = HorizontalTaps8To16<filter_index>(&src[x], v_tap); - StoreUnaligned16(&dest16[x], v_sum); + if (is_2d) { + StoreAligned16(&dest16[x], v_sum); + } else { + StoreUnaligned16(&dest16[x], v_sum); + } } else { const __m128i result = SimpleHorizontalTaps<filter_index>(&src[x], v_tap); @@ -236,7 +282,12 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, int y = 0; do { if (is_2d) { - // TODO(slavarnway): Add 2d support + const __m128i sum = + HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + Store4(&dest16[0], sum); + dest16 += pred_stride; + Store4(&dest16[0], _mm_srli_si128(sum, 8)); + dest16 += pred_stride; } else { const __m128i sum = SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); @@ -254,13 +305,33 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, // generates context for the vertical pass. if (is_2d) { assert(height % 2 == 1); - // TODO(slavarnway): Add 2d support + __m128i sum; + const __m128i input = LoadLo8(&src[2]); + if (filter_index == 3) { + // 03 04 04 05 05 06 06 07 .... + const __m128i v_src_43 = + _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); + sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 + } else { + // 02 03 03 04 04 05 05 06 06 07 .... + const __m128i v_src_32 = + _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1); + // 04 05 05 06 06 07 07 08 ... 
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4); + const __m128i v_madd_32 = + _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 + const __m128i v_madd_54 = + _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 + sum = _mm_add_epi16(v_madd_54, v_madd_32); + } + sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); + Store4(dest16, sum); } } } } -template <int num_taps> +template <int num_taps, bool is_2d_vertical = false> LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, __m128i* v_tap) { if (num_taps == 8) { @@ -268,30 +339,295 @@ LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + } } else if (num_taps == 6) { const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + } } else if (num_taps == 4) { v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + } } else { // num_taps == 2 const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + } } } +template <int num_taps, bool is_compound> +__m128i SimpleSum2DVerticalTaps(const __m128i* const src, + const __m128i* const taps) { + __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); + __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); + if (num_taps >= 4) { + __m128i madd_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); + __m128i madd_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = 
_mm_add_epi32(sum_hi, madd_hi); + if (num_taps >= 6) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps == 8) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + } + } + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +template <int num_taps, bool is_compound = false> +void Filter2DVertical(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const __m128i* const taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + // The Horizontal pass uses |width| as |stride| for the intermediate buffer. + const ptrdiff_t src_stride = width; + + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + int x = 0; + do { + __m128i srcs[8]; + const uint16_t* src_x = src + x; + srcs[0] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadAligned16(src_x); + src_x += src_stride; + srcs[2] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadAligned16(src_x); + src_x += src_stride; + srcs[4] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadAligned16(src_x); + src_x += src_stride; + srcs[6] = LoadAligned16(src_x); + src_x += src_stride; + } + } + } + + int y = 0; + do { + srcs[next_row] = LoadAligned16(src_x); + src_x += src_stride; + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16 + x + y * dst_stride, sum); + } else { + StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum)); + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (++y < height); + x += 8; + } while (x < width); +} + +// Take advantage of |src_stride| == |width| to process two rows at a time. 
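In other words, because the horizontal pass writes its 16-bit intermediate with a stride equal to the block width, a 4-wide block keeps two consecutive rows in eight contiguous uint16_t values, which is exactly one aligned 16-byte load. A minimal sketch of that layout assumption (illustrative helper, not part of the patch):

  #include <cstdint>

  // For a 4-wide block the 2D intermediate buffer is tightly packed, so rows
  // r and r + 1 together span 8 uint16_t = 16 bytes and can be fetched with a
  // single LoadAligned16 in Filter2DVertical4xH below.
  inline const uint16_t* TwoRowSpan(const uint16_t* intermediate, int r) {
    constexpr int kStride = 4;          // stride == width for the 2D pass
    return intermediate + r * kStride;  // elements [0, 8) cover rows r, r + 1
  }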
+template <int num_taps, bool is_compound = false> +void Filter2DVertical4xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 4) { + srcs[2] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); + if (num_taps == 8) { + srcs[6] = LoadAligned16(src); + src += 8; + srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); + } + } + } + + int y = 0; + do { + srcs[num_taps] = LoadAligned16(src); + src += 8; + srcs[num_taps - 1] = _mm_unpacklo_epi64( + _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16, sum); + dst16 += 4 << 1; + } else { + const __m128i results = _mm_packus_epi16(sum, sum); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y += 2; + } while (y < height); +} + +// Take advantage of |src_stride| == |width| to process four rows at a time. +template <int num_taps> +void Filter2DVertical2xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + constexpr int next_row = (num_taps < 6) ? 4 : 8; + + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + if (num_taps == 8) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } + } + + int y = 0; + do { + srcs[next_row] = LoadAligned16(src); + src += 8; + if (num_taps == 2) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + } else if (num_taps == 4) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } else if (num_taps == 6) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + } else if (num_taps == 8) { + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); + srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); + } + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); + const __m128i results = _mm_packus_epi16(sum, sum); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. + // Therefore we don't need to check this condition when |height| > 4. 
+ if (num_taps <= 4 && height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + if (num_taps == 6) { + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + } else if (num_taps == 8) { + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + } + + y += 4; + } while (y < height); +} + template <bool is_2d = false, bool is_compound = false> LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, @@ -330,6 +666,765 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } } +void Convolve2D_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, const int subpixel_x, + const int subpixel_y, const int width, const int height, + void* prediction, const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + + // The output of the horizontal filter is guaranteed to fit in 16 bits. + alignas(16) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + const int intermediate_height = height + vertical_taps - 1; + + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + + DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width, + width, intermediate_height, subpixel_x, + horiz_filter_index); + + // Vertical filter. 
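As a worked example of the setup just above: with an 8-tap vertical filter and a 16-row block, the horizontal pass produces intermediate_height = height + vertical_taps - 1 = 16 + 8 - 1 = 23 rows of 16-bit output, and the source pointer is moved back by vertical_taps / 2 - 1 = 3 rows and kHorizontalOffset = 3 columns, so the first vertical window (rows -3 .. +4 relative to output row 0) reads rows the horizontal pass has actually produced, and even an 8-tap horizontal window has its three left-hand taps in bounds.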
+ auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask; + assert(filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height, + taps); + } + } +} + +// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D +// Vertical calculations. +__m128i Compound1DShift(const __m128i sum) { + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int filter_index> +__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { + __m128i v_src[4]; + + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. 
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + } + const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + return sum; +} + +template <int filter_index, bool is_compound = false> +void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int width, const int height, + const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + assert(width >= 8); + + int x = 0; + do { + const uint8_t* src_x = src + x; + __m128i srcs[8]; + srcs[0] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadLo8(src_x); + src_x += src_stride; + srcs[2] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadLo8(src_x); + src_x += src_stride; + srcs[4] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadLo8(src_x); + src_x += src_stride; + srcs[6] = LoadLo8(src_x); + src_x += src_stride; + } + } + } + + int y = 0; + do { + srcs[next_row] = LoadLo8(src_x); + src_x += src_stride; + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16 + x + y * dst_stride, results); + } else { + const __m128i results = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results)); + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (++y < height); + x += 8; + } while (x < width); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + + int y = 0; + do { + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + y += 2; + } while (y < height); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; 
+ // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + int y = 0; + do { + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + y += 2; + } while (y < height); + } else if (num_taps == 6) { + srcs[6] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + int y = 0; + do { + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + y += 2; + } while (y < height); + } else if (num_taps == 8) { + srcs[8] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + int y = 0; + do { + // 70 71 72 73 + const __m128i d 
= Load4(src); + // 60 61 62 63 70 71 72 73 + srcs[6] = _mm_unpacklo_epi32(srcs[6], d); + src += src_stride; + // 80 81 82 83 + srcs[8] = Load4(src); + src += src_stride; + // 70 71 72 73 80 81 82 83 + srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + y += 2; + } while (y < height); + } +} + +template <int filter_index, bool negative_outside_taps = false> +void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + + int y = 0; + do { + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[2] = Load2<0>(src, srcs[2]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 + const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_2, 2); + // This uses srcs[0]..srcs[1]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[2]; + y += 4; + } while (y < height); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + + int y = 0; + do { + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2<0>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + // This uses srcs[0]..srcs[3]. 
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + y += 4; + } while (y < height); + } else if (num_taps == 6) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4x, 2); + + int y = 0; + do { + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + + // This uses srcs[0]..srcs[5]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + y += 4; + } while (y < height); + } else if (num_taps == 8) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. 
+ assert(height > 4); + srcs[8] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + int y = 0; + do { + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 + srcs[8] = Load2<1>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 a0 a1 + srcs[8] = Load2<2>(src, srcs[8]); + src += src_stride; + + // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + // 60 61 70 71 80 81 90 91 + srcs[6] = _mm_srli_si128(srcs_4_8, 4); + // 70 71 80 81 90 91 a0 a1 + srcs[7] = _mm_srli_si128(srcs_4_8, 6); + + // This uses srcs[0]..srcs[7]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + y += 4; + } while (y < height); + } +} + +void ConvolveVertical_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, + const int vertical_filter_index, + const int /*subpixel_x*/, const int subpixel_y, + const int width, const int height, + void* prediction, const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + const int filter_id = (subpixel_y >> 6) & kSubPixelMask; + assert(filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 2) { // 8 tap. 
+ SetupTaps<8>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else { + // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases. + // See convolve_neon.cc + SetupTaps<4>(&v_filter, taps); + + if (width == 2) { + FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } +} + void ConvolveCompoundCopy_SSE4( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, @@ -388,6 +1483,76 @@ void ConvolveCompoundCopy_SSE4( } } +void ConvolveCompoundVertical_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int vertical_filter_index, + const int /*subpixel_x*/, const int subpixel_y, const int width, + const int height, void* prediction, const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint16_t*>(prediction); + const int filter_id = (subpixel_y >> 6) & kSubPixelMask; + assert(filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 4) { + FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 4) { // 4 tap. 
+ SetupTaps<4>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else { + SetupTaps<4>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } +} + void ConvolveHorizontal_SSE4_1(const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, @@ -418,13 +1583,720 @@ void ConvolveCompoundHorizontal_SSE4_1( filter_index); } +void ConvolveCompound2D_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int horizontal_filter_index, const int vertical_filter_index, + const int subpixel_x, const int subpixel_y, const int width, + const int height, void* prediction, const ptrdiff_t /*pred_stride*/) { + // The output of the horizontal filter, i.e. the intermediate_result, is + // guaranteed to fit in int16_t. + alignas(16) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [4, 5]. + // Similarly for height. + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int intermediate_height = height + vertical_taps - 1; + const ptrdiff_t src_stride = reference_stride; + const auto* const src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - + kHorizontalOffset; + + DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>( + src, src_stride, intermediate_result, width, width, intermediate_height, + subpixel_x, horiz_filter_index); + + // Vertical filter. 
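Note that in the compound path the "prediction" written here is not final pixels: the vertical pass stores 16-bit, intermediate-precision values into a tightly packed buffer (dest_stride is set to width just below and the caller's pred_stride is ignored), and the two single-reference predictions are only combined afterwards by the separate blend kernels. A sketch of the staging buffer this implies (sizes are the library's maximums; the declaration is illustrative, not from the patch):

  // Tightly packed width x height staging buffer for one compound prediction.
  alignas(16) uint16_t compound_prediction[kMaxSuperBlockSizeInPixels *
                                           kMaxSuperBlockSizeInPixels];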
+ auto* dest = static_cast<uint16_t*>(prediction); + const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask; + assert(filter_id != 0); + + const ptrdiff_t dest_stride = width; + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } +} + +// Pre-transposed filters. +template <int filter_index> +inline void GetHalfSubPixelFilter(__m128i* output) { + // Filter 0 + alignas( + 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] = + {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}, + {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1}, + {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, + {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, + {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3}, + {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; + // Filter 1 + alignas(16) static constexpr int8_t + kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = { + {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0}, + {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1}, + {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, + {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, + {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}, + {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}}; + // Filter 2 + alignas( + 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] = + {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0}, + {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1}, + {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1}, + {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4}, + {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63}, + {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3}, + {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1}, + {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}}; + // Filter 3 + alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = { + {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4}, + {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}}; 
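These tables are the usual AV1 sub-pixel filters with every coefficient halved, which is why each 16-entry column sums to 64 rather than 128 and why the rounding shifts throughout this file drop one bit (kInterRoundBitsHorizontal - 1, kFilterBits - 1, and so on). For instance, the bilinear (Filter 3) taps for phase 4 are {48, 16} here, half of the full-precision {96, 32}. A trivial sketch of that relationship (illustrative helper, not from the library):

  // Recover full-precision AV1 filter taps from a halved filter column.
  inline void FullTapsFromHalf(const int* half_taps, int num_taps,
                               int* full_taps) {
    for (int k = 0; k < num_taps; ++k) full_taps[k] = 2 * half_taps[k];
  }

The remaining 4-tap tables follow below.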
+ // Filter 4 + alignas( + 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] = + {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1}, + {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, + {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, + {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}}; + // Filter 5 + alignas( + 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = { + {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1}, + {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, + {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, + {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}}; + switch (filter_index) { + case 0: + output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]); + break; + case 1: + // The term "mixed" refers to the fact that the outer taps have a mix of + // negative and positive values. + output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]); + break; + case 2: + output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]); + output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]); + output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]); + break; + case 3: + output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]); + break; + case 4: + output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]); + break; + default: + assert(filter_index == 5); + output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]); + output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]); + output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]); + output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]); + break; + } +} + +// There are many opportunities for overreading in scaled convolve, because +// the range of starting points for filter windows is anywhere from 0 to 16 +// for 8 destination pixels, and the window sizes range from 2 to 8. To +// accommodate this range concisely, we use |grade_x| to mean the most steps +// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2. 
+// More importantly, |grade_x| answers the question "how many vector loads are +// needed to cover the source values?" +// When |grade_x| == 1, the maximum number of source values needed is 8 separate +// starting positions plus 7 more to cover taps, all fitting into 16 bytes. +// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for +// every 8 |step_x| increments, on top of 8 possible taps. The first load covers +// the starting sources for each kernel, while the final load covers the taps. +// Since the offset value of src_x cannot exceed 8 and |num_taps| does not +// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of +// |step_x|. +template <int num_taps, int grade_x> +inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, + __m128i source[num_taps >> 1]) { + const __m128i src_vals = LoadUnaligned16(src); + source[0] = _mm_shuffle_epi8(src_vals, src_indices); + if (grade_x == 1) { + if (num_taps > 2) { + source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices); + } + if (num_taps > 4) { + source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices); + } + if (num_taps > 6) { + source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices); + } + } else { + assert(grade_x > 1); + assert(num_taps != 4); + // grade_x > 1 also means width >= 8 && num_taps != 4 + const __m128i src_vals_ext = LoadLo8(src + 16); + if (num_taps > 2) { + source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2), + src_indices); + source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4), + src_indices); + } + if (num_taps > 6) { + source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6), + src_indices); + } + } +} + +template <int num_taps> +inline void PrepareHorizontalTaps(const __m128i subpel_indices, + const __m128i* filter_taps, + __m128i* out_taps) { + const __m128i scale_index_offsets = + _mm_srli_epi16(subpel_indices, kFilterIndexShift); + const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask); + const __m128i filter_indices = + _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets), + filter_index_mask); + // Line up taps for maddubs_epi16. + // The unpack is also assumed to be lighter than shift+alignr. + for (int k = 0; k < (num_taps >> 1); ++k) { + const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices); + const __m128i taps1 = + _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices); + out_taps[k] = _mm_unpacklo_epi8(taps0, taps1); + } +} + +inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) { + const __m128i src_indices16 = + _mm_srli_epi16(subpel_indices, kScaleSubPixelBits); + const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16); + return _mm_unpacklo_epi8(src_indices, + _mm_add_epi8(src_indices, _mm_set1_epi8(1))); +} + +template <int grade_x, int filter_index, int num_taps> +inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride, + int width, int subpixel_x, int step_x, + int intermediate_height, + int16_t* intermediate) { + // Account for the 0-taps that precede the 2 nonzero taps. 
+ const int kernel_offset = (8 - num_taps) >> 1; + const int ref_x = subpixel_x >> kScaleSubPixelBits; + const int step_x8 = step_x << 3; + __m128i filter_taps[num_taps]; + GetHalfSubPixelFilter<filter_index>(filter_taps); + const __m128i index_steps = + _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), + _mm_set1_epi16(static_cast<int16_t>(step_x))); + + __m128i taps[num_taps >> 1]; + __m128i source[num_taps >> 1]; + int p = subpixel_x; + // Case when width <= 4 is possible. + if (filter_index >= 3) { + if (filter_index > 3 || width <= 4) { + const uint8_t* src_x = + &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; + // Only add steps to the 10-bit truncated p to avoid overflow. + const __m128i p_fraction = _mm_set1_epi16(p & 1023); + const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction); + PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps); + const __m128i packed_indices = HorizontalScaleIndices(subpel_indices); + + int y = intermediate_height; + do { + // Load and line up source values with the taps. Width 4 means no need + // to load extended source. + PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices, + source); + + StoreLo8(intermediate, RightShiftWithRounding_S16( + SumOnePassTaps<filter_index>(source, taps), + kInterRoundBitsHorizontal - 1)); + src_x += src_stride; + intermediate += kIntermediateStride; + } while (--y != 0); + return; + } + } + + // |width| >= 8 + int x = 0; + do { + const uint8_t* src_x = + &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; + int16_t* intermediate_x = intermediate + x; + // Only add steps to the 10-bit truncated p to avoid overflow. + const __m128i p_fraction = _mm_set1_epi16(p & 1023); + const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction); + PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps); + const __m128i packed_indices = HorizontalScaleIndices(subpel_indices); + + int y = intermediate_height; + do { + // For each x, a lane of src_k[k] contains src_x[k]. + PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source); + + // Shift by one less because the taps are halved. + StoreAligned16( + intermediate_x, + RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps), + kInterRoundBitsHorizontal - 1)); + src_x += src_stride; + intermediate_x += kIntermediateStride; + } while (--y != 0); + x += 8; + p += step_x8; + } while (x < width); +} + +template <int num_taps> +inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) { + // Avoid overreading the filter due to starting at kernel_offset. + // The only danger of overread is in the final filter, which has 4 taps. + const __m128i filter = + _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps)); + output[0] = _mm_shuffle_epi32(filter, 0); + if (num_taps > 2) { + output[1] = _mm_shuffle_epi32(filter, 0x55); + } + if (num_taps > 4) { + output[2] = _mm_shuffle_epi32(filter, 0xAA); + } + if (num_taps > 6) { + output[3] = _mm_shuffle_epi32(filter, 0xFF); + } +} + +// Process eight 16 bit inputs and output eight 16 bit values. 
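The Sum2DVerticalTaps helpers below lean on one SSE idiom worth spelling out: _mm_unpacklo_epi16 / _mm_unpackhi_epi16 interleave two adjacent intermediate rows, and _mm_madd_epi16 then multiplies each interleaved pair by a (tap k, tap k+1) pair and adds the products into a 32-bit lane, so every madd applies two vertical taps at once without risking 16-bit overflow. A scalar model of one such lane (illustrative, not from the patch):

  #include <cstdint>

  // What one 32-bit lane of
  //   _mm_madd_epi16(_mm_unpacklo_epi16(row_k, row_k1), taps_k_k1)
  // computes: two vertical taps applied to one column, accumulated at 32 bits.
  inline int32_t MaddPair(int16_t row_k_px, int16_t row_k1_px, int16_t tap_k,
                          int16_t tap_k1) {
    return static_cast<int32_t>(row_k_px) * tap_k +
           static_cast<int32_t>(row_k1_px) * tap_k1;
  }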
+template <int num_taps, bool is_compound> +inline __m128i Sum2DVerticalTaps(const __m128i* const src, + const __m128i* taps) { + const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]); + const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]); + if (num_taps > 2) { + const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1])); + const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1])); + } + if (num_taps > 4) { + const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2])); + const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2])); + } + if (num_taps > 6) { + const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3])); + const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3])); + } + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +// Bottom half of each src[k] is the source for one filter, and the top half +// is the source for the other filter, for the next destination row. +template <int num_taps, bool is_compound> +__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo, + const __m128i* taps_hi) { + const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]); + const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]); + if (num_taps > 2) { + const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1])); + const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1])); + } + if (num_taps > 4) { + const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2])); + const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2])); + } + if (num_taps > 6) { + const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3])); + const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3])); + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +// |width_class| is 2, 4, or 8, according to the Store function that should be +// used. 
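Before the SIMD version below, a scalar sketch of the bookkeeping ConvolveVerticalScale performs: each destination row derives both its first intermediate source row and its filter phase from the running position |p|, then advances |p| by |step_y|. The constants mirror libgav1's (kScaleSubPixelBits == 10, kSubPixelMask == 15); the helper itself is illustrative only.

  // Per-row source row and filter phase selection for the vertical scale pass.
  inline void VerticalScaleIndexing(int subpixel_y, int step_y, int height,
                                    int* first_row, int* filter_id) {
    constexpr int kScaleSubPixelBits = 10;
    constexpr int kSubPixelMask = 15;
    int p = subpixel_y & 1023;
    for (int y = 0; y < height; ++y) {
      first_row[y] = p >> kScaleSubPixelBits;   // row in the intermediate
      filter_id[y] = (p >> 6) & kSubPixelMask;  // one of 16 sub-pixel phases
      p += step_y;
    }
  }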
+template <int num_taps, int width_class, bool is_compound> +#if LIBGAV1_MSAN +__attribute__((no_sanitize_memory)) void ConvolveVerticalScale( +#else +inline void ConvolveVerticalScale( +#endif + const int16_t* src, const int width, const int subpixel_y, + const int filter_index, const int step_y, const int height, void* dest, + const ptrdiff_t dest_stride) { + constexpr ptrdiff_t src_stride = kIntermediateStride; + constexpr int kernel_offset = (8 - num_taps) / 2; + const int16_t* src_y = src; + // |dest| is 16-bit in compound mode, Pixel otherwise. + auto* dest16_y = static_cast<uint16_t*>(dest); + auto* dest_y = static_cast<uint8_t*>(dest); + __m128i s[num_taps]; + + int p = subpixel_y & 1023; + int y = height; + if (width_class <= 4) { + __m128i filter_taps_lo[num_taps >> 1]; + __m128i filter_taps_hi[num_taps >> 1]; + do { // y > 0 + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadLo8(src_y + i * src_stride); + } + int filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter0 = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo); + p += step_y; + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadHi8(s[i], src_y + i * src_stride); + } + filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter1 = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi); + p += step_y; + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + + const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>( + s, filter_taps_lo, filter_taps_hi); + if (is_compound) { + assert(width_class > 2); + StoreLo8(dest16_y, sums); + dest16_y += dest_stride; + StoreHi8(dest16_y, sums); + dest16_y += dest_stride; + } else { + const __m128i result = _mm_packus_epi16(sums, sums); + if (width_class == 2) { + Store2(dest_y, result); + dest_y += dest_stride; + Store2(dest_y, _mm_srli_si128(result, 4)); + } else { + Store4(dest_y, result); + dest_y += dest_stride; + Store4(dest_y, _mm_srli_si128(result, 4)); + } + dest_y += dest_stride; + } + y -= 2; + } while (y != 0); + return; + } + + // |width_class| >= 8 + __m128i filter_taps[num_taps >> 1]; + do { // y > 0 + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + const int filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter, filter_taps); + + int x = 0; + do { // x < width + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadUnaligned16(src_y + i * src_stride); + } + + const __m128i sums = + Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps); + if (is_compound) { + StoreUnaligned16(dest16_y + x, sums); + } else { + StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums)); + } + x += 8; + src_y += 8; + } while (x < width); + p += step_y; + dest_y += dest_stride; + dest16_y += dest_stride; + } while (--y != 0); +} + +template <bool is_compound> +void ConvolveScale2D_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int subpixel_x, const int subpixel_y, + const int step_x, const int step_y, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + assert(step_x <= 2048); 
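To read the assert above: with kScaleSubPixelBits == 10, a step of 1 << 10 = 1024 advances exactly one source pixel per destination pixel, so 2048 corresponds to the largest (2:1) downscale the scaled-reference path has to handle. That bound is also what keeps |grade_x| at 1 or 2 in ConvolveHorizontalScale above: eight outputs at the maximum step span 8 * 2048 >> 10 = 16 source pixels, which, together with up to 7 extra tap positions, still fits within the 16-byte load plus the 8-byte extension load used when |grade_x| == 2.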
+ // The output of the horizontal filter, i.e. the intermediate_result, is + // guaranteed to fit in int16_t. + // TODO(petersonab): Reduce intermediate block stride to width to make smaller + // blocks faster. + alignas(16) int16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)]; + const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); + const int intermediate_height = + (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> + kScaleSubPixelBits) + + num_vert_taps; + + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [3, 5]. + // Similarly for height. + int16_t* intermediate = intermediate_result; + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference); + const int vert_kernel_offset = (8 - num_vert_taps) / 2; + src += vert_kernel_offset * src_stride; + + // Derive the maximum value of |step_x| at which all source values fit in one + // 16-byte load. Final index is src_x + |num_taps| - 1 < 16 + // step_x*7 is the final base sub-pixel index for the shuffle mask for filter + // inputs in each iteration on large blocks. When step_x is large, we need a + // second register and alignr in order to gather all filter inputs. + // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap. + const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int kernel_start_ceiling = 16 - num_horiz_taps; + // This truncated quotient |grade_x_threshold| selects |step_x| such that: + // (step_x * 7) >> kScaleSubPixelBits < single load limit + const int grade_x_threshold = + (kernel_start_ceiling << kScaleSubPixelBits) / 7; + switch (horiz_filter_index) { + case 0: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 1: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + + } else { + ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 2: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 3: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 4: + assert(width <= 4); + ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + break; + default: + assert(horiz_filter_index == 5); + assert(width <= 4); + ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + + // Vertical filter. 
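  // (Editor's gloss, not part of the patch.) The switch below dispatches on the
  // tap count implied by |vert_filter_index| (0/1 -> 6 taps, 2 -> 8 taps,
  // 3 -> 2 taps, 4/5 -> 4 taps) and on a width class of 2, 4 or 8+, matching
  // ConvolveVerticalScale's <num_taps, width_class, is_compound> template
  // parameters.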
+ intermediate = intermediate_result; + switch (vert_filter_index) { + case 0: + case 1: + if (!is_compound && width == 2) { + ConvolveVerticalScale<6, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<6, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<6, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + case 2: + if (!is_compound && width == 2) { + ConvolveVerticalScale<8, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<8, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<8, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + case 3: + if (!is_compound && width == 2) { + ConvolveVerticalScale<2, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<2, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<2, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + default: + assert(vert_filter_index == 4 || vert_filter_index == 5); + if (!is_compound && width == 2) { + ConvolveVerticalScale<4, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<4, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<4, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + } +} + void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1; + dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1; + dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1; dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4; dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1; + + dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>; + dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>; } } // namespace diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h index 92f35d79426..e449a87436f 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h @@ -38,6 +38,14 @@ void ConvolveInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical +#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_Convolve2D +#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1 +#endif + #ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy 
#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1 #endif @@ -46,6 +54,22 @@ void ConvolveInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical +#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D +#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D +#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D +#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_ENABLE_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_ diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc index 78dec96bc69..edb8b1405f8 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc @@ -1143,7 +1143,7 @@ template <int bitdepth> struct LoopFilterFuncs_SSE4_1 { LoopFilterFuncs_SSE4_1() = delete; - static const int kThreshShift = bitdepth - 8; + static constexpr int kThreshShift = bitdepth - 8; static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, int hev_thresh); diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc index 02b7ed03e1a..7a01ab15aae 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc @@ -36,14 +36,6 @@ namespace dsp { namespace low_bitdepth { namespace { -// Precision of a division table (mtable) -constexpr int kSgrProjScaleBits = 20; -constexpr int kSgrProjReciprocalBits = 12; -// Core self-guided restoration precision bits. -constexpr int kSgrProjSgrBits = 8; -// Precision bits of generated values higher than source before projection. -constexpr int kSgrProjRestoreBits = 4; - // Note: range of wiener filter coefficients. // Wiener filter coefficients are symmetric, and their sum is 1 (128). // The range of each coefficient: @@ -85,12 +77,12 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1; const auto* src = static_cast<const uint8_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - const ptrdiff_t buffer_stride = buffer->wiener_buffer_stride; - auto* wiener_buffer = buffer->wiener_buffer; + const ptrdiff_t buffer_stride = (width + 7) & ~7; + auto* wiener_buffer = buffer->wiener_buffer + buffer_stride; // horizontal filtering. 
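  // (Editor's note, not part of the patch.) The new |buffer_stride| above,
  // (width + 7) & ~7, rounds |width| up to the next multiple of 8 (e.g.
  // 1 -> 8, 8 -> 8, 13 -> 16), presumably so every intermediate row written by
  // the horizontal pass spans a whole number of 8-lane 16-bit vectors.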
PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter); const int center_tap = 3; - src -= center_tap * source_stride + center_tap; + src -= (center_tap - 1) * source_stride + center_tap; const int horizontal_rounding = 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); @@ -108,7 +100,7 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, const __m128i v_offset_shift = _mm_cvtsi32_si128(7 - kInterRoundBitsHorizontal); - int y = 0; + int y = height + kSubPixelTaps - 4; do { int x = 0; do { @@ -156,9 +148,16 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, } while (x < width); src += source_stride; wiener_buffer += buffer_stride; - } while (++y < height + kSubPixelTaps - 2); - + } while (--y != 0); + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. + memcpy(wiener_buffer, wiener_buffer - buffer_stride, + sizeof(*wiener_buffer) * width); wiener_buffer = buffer->wiener_buffer; + memcpy(wiener_buffer, wiener_buffer + buffer_stride, + sizeof(*wiener_buffer) * width); + // vertical filtering. PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter); @@ -211,521 +210,1380 @@ void WienerFilter_SSE4_1(const void* source, void* const dest, } while (++y < height); } -// Section 7.17.3. -// a2: range [1, 256]. -// if (z >= 255) -// a2 = 256; -// else if (z == 0) -// a2 = 1; -// else -// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); -constexpr int kXByXPlus1[256] = { - 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, - 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, - 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, - 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, - 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, - 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, - 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 256}; - -inline __m128i HorizontalAddVerticalSumsRadius1(const uint32_t* vert_sums) { - // Horizontally add vertical sums to get total box sum. 
- const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]); - const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]); - const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4); - const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8); - const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321); - const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432); - return v_s1; -} - -inline __m128i HorizontalAddVerticalSumsRadius2(const uint32_t* vert_sums) { - // Horizontally add vertical sums to get total box sum. - const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]); - const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]); - const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4); - const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8); - const __m128i v_sums_6543 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 12); - const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321); - const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432); - const __m128i v_s2 = _mm_add_epi32(v_s1, v_sums_6543); - const __m128i v_s3 = _mm_add_epi32(v_s2, v_sums_7654); - return v_s3; -} - -void BoxFilterPreProcessRadius1_SSE4_1( - const uint8_t* const src, ptrdiff_t stride, int width, int height, - uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride, - uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) { - assert(s != 0); - const uint32_t n = 9; - const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const __m128i v_one_over_n = - _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0); - const __m128i v_sgrbits = - _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0); - -#if LIBGAV1_MSAN - // Over-reads occur in the x loop, so set to a known value. - memset(&vertical_sums[width], 0, 8 * sizeof(vertical_sums[0])); - memset(&vertical_sum_of_squares[width], 0, - 8 * sizeof(vertical_sum_of_squares[0])); -#endif +//------------------------------------------------------------------------------ +// SGR - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. - int y = -1; - do { - const uint8_t* top_left = &src[(y - 1) * stride - 2]; - // Calculate the box vertical sums for each x position. - int vsx = -2; - do { - const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left)); - const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride)); - const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2)); - const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0); - const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1); - const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2); - const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1); - const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2); - const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1); - const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2); - StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a012); - StoreUnaligned16(&vertical_sums[vsx], v_b012); - top_left += 4; - vsx += 4; - } while (vsx <= width + 1); - - int x = -1; +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
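(Editor's aside, not part of the patch.) The helpers that follow borrow NEON's vaddl/vaddw/vmull naming but widen operands with an unpack against a zero register, which is the pattern the comment above recommends over the cvtepu intrinsics. A minimal sketch of that idiom:

  #include <emmintrin.h>  // SSE2: _mm_unpacklo_epi8, _mm_setzero_si128.

  // Zero-extends the low eight uint8 lanes of |a| to eight uint16 lanes
  // without _mm_cvtepu8_epi16(); VaddlLo8() and friends below build on this.
  inline __m128i ZeroExtendLo8To16(const __m128i a) {
    return _mm_unpacklo_epi8(a, _mm_setzero_si128());
  }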
+ +inline __m128i VaddlLo8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a0, b0); +} + +inline __m128i VaddlHi8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a0, b0); +} + +inline __m128i VaddlLo16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a0, b0); +} + +inline __m128i VaddlHi16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a0, b0); +} + +inline __m128i VaddwLo8(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a, b0); +} + +inline __m128i VaddwHi8(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128()); + return _mm_add_epi16(a, b0); +} + +inline __m128i VaddwLo16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a, b0); +} + +inline __m128i VaddwHi16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(a, b0); +} + +// Using VgetLane16() can save a sign extension instruction. +template <int n> +inline int16_t VgetLane16(const __m128i a) { + return _mm_extract_epi16(a, n); +} + +inline __m128i VmullLo8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + return _mm_mullo_epi16(a0, b0); +} + +inline __m128i VmullHi8(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi8(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128()); + return _mm_mullo_epi16(a0, b0); +} + +inline __m128i VmullNLo8(const __m128i a, const int16_t b) { + const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128()); + return _mm_madd_epi16(a0, _mm_set1_epi32(b)); +} + +inline __m128i VmullNHi8(const __m128i a, const int16_t b) { + const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128()); + return _mm_madd_epi16(a0, _mm_set1_epi32(b)); +} + +inline __m128i VmullLo16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a0, b0); +} + +inline __m128i VmullHi16(const __m128i a, const __m128i b) { + const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128()); + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a0, b0); +} + +inline __m128i VmulwLo16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a, b0); +} + +inline __m128i VmulwHi16(const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_madd_epi16(a, b0); +} + +inline __m128i VmlalNLo16(const __m128i sum, const __m128i a, const int16_t b) { + return _mm_add_epi32(sum, VmullNLo8(a, b)); +} + +inline __m128i VmlalNHi16(const __m128i sum, const __m128i a, const int16_t b) { + return 
_mm_add_epi32(sum, VmullNHi8(a, b)); +} + +inline __m128i VmlawLo16(const __m128i sum, const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(sum, _mm_madd_epi16(a, b0)); +} + +inline __m128i VmlawHi16(const __m128i sum, const __m128i a, const __m128i b) { + const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + return _mm_add_epi32(sum, _mm_madd_epi16(a, b0)); +} + +inline __m128i VrshrNS32(const __m128i a, const int b) { + const __m128i sum = _mm_add_epi32(a, _mm_set1_epi32(1 << (b - 1))); + return _mm_srai_epi32(sum, b); +} + +inline __m128i VrshrN32(const __m128i a, const int b) { + const __m128i sum = _mm_add_epi32(a, _mm_set1_epi32(1 << (b - 1))); + return _mm_srli_epi32(sum, b); +} + +inline __m128i VshllN8(const __m128i a, const int b) { + const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + return _mm_slli_epi16(a0, b); +} + +template <int n> +inline __m128i CalcAxN(const __m128i a) { + static_assert(n == 9 || n == 25, ""); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. + // return _mm_mullo_epi32(a, _mm_set1_epi32(n)); + const __m128i ax9 = _mm_add_epi32(a, _mm_slli_epi32(a, 3)); + if (n == 9) return ax9; + if (n == 25) return _mm_add_epi32(ax9, _mm_slli_epi32(a, 4)); +} + +template <int n> +inline __m128i CalculateSgrMA2(const __m128i sum_sq, const __m128i sum, + const uint32_t s) { + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + const __m128i axn = CalcAxN<n>(sum_sq); + const __m128i sub = _mm_sub_epi32(axn, dxd); + const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128()); + + // z = RightShiftWithRounding(p * s, kSgrProjScaleBits); + const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(s)); + return VrshrN32(pxs, kSgrProjScaleBits); +} + +inline __m128i CalculateIntermediate4(const __m128i sgr_ma2, const __m128i sum, + const uint32_t one_over_n) { + // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n + // 1 << kSgrProjSgrBits = 256 + // |a2| = [1, 256] + // |sgr_ma2| max value = 255 + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + const __m128i sgr_ma2q = _mm_unpacklo_epi8(sgr_ma2, _mm_setzero_si128()); + const __m128i s = _mm_unpackhi_epi16(sgr_ma2q, _mm_setzero_si128()); + const __m128i m = _mm_madd_epi16(s, sum); + const __m128i b2 = _mm_mullo_epi32(m, _mm_set1_epi32(one_over_n)); + // static_cast<int>(RightShiftWithRounding(b2, kSgrProjReciprocalBits)); + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i truncate_u32 = VrshrN32(b2, kSgrProjReciprocalBits); + return _mm_packus_epi32(truncate_u32, truncate_u32); +} + +inline __m128i CalculateIntermediate8(const __m128i sgr_ma2, const __m128i sum, + const uint32_t one_over_n) { + // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n + // 1 << kSgrProjSgrBits = 256 + // |a2| = [1, 256] + // |sgr_ma2| max value = 255 + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. 
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + const __m128i sgr_ma2q = _mm_unpackhi_epi8(sgr_ma2, _mm_setzero_si128()); + const __m128i m0 = VmullLo16(sgr_ma2q, sum); + const __m128i m1 = VmullHi16(sgr_ma2q, sum); + const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); + const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n)); + // static_cast<int>(RightShiftWithRounding(b2, kSgrProjReciprocalBits)); + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i b2_lo = VrshrN32(m2, kSgrProjReciprocalBits); + const __m128i b2_hi = VrshrN32(m3, kSgrProjReciprocalBits); + return _mm_packus_epi32(b2_lo, b2_hi); +} + +inline __m128i Sum3_16(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = _mm_add_epi16(left, middle); + return _mm_add_epi16(sum, right); +} + +inline __m128i Sum3_32(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = _mm_add_epi32(left, middle); + return _mm_add_epi32(sum, right); +} + +inline __m128i Sum3W_16(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = VaddlLo8(left, middle); + return VaddwLo8(sum, right); +} + +inline __m128i Sum3WLo_16(const __m128i a[3]) { + return Sum3W_16(a[0], a[1], a[2]); +} + +inline __m128i Sum3WHi_16(const __m128i a[3]) { + const __m128i sum = VaddlHi8(a[0], a[1]); + return VaddwHi8(sum, a[2]); +} + +inline __m128i Sum3WLo_32(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = VaddlLo16(left, middle); + return VaddwLo16(sum, right); +} + +inline __m128i Sum3WHi_32(const __m128i left, const __m128i middle, + const __m128i right) { + const __m128i sum = VaddlHi16(left, middle); + return VaddwHi16(sum, right); +} + +inline __m128i* Sum3W_16x2(const __m128i a[3], __m128i sum[2]) { + sum[0] = Sum3WLo_16(a); + sum[1] = Sum3WHi_16(a); + return sum; +} + +inline __m128i* Sum3W(const __m128i a[3], __m128i sum[2]) { + sum[0] = Sum3WLo_32(a[0], a[1], a[2]); + sum[1] = Sum3WHi_32(a[0], a[1], a[2]); + return sum; +} + +template <int index> +inline __m128i Sum3WLo(const __m128i a[3][2]) { + const __m128i b0 = a[0][index]; + const __m128i b1 = a[1][index]; + const __m128i b2 = a[2][index]; + return Sum3WLo_32(b0, b1, b2); +} + +inline __m128i Sum3WHi(const __m128i a[3][2]) { + const __m128i b0 = a[0][0]; + const __m128i b1 = a[1][0]; + const __m128i b2 = a[2][0]; + return Sum3WHi_32(b0, b1, b2); +} + +inline __m128i* Sum3W(const __m128i a[3][2], __m128i sum[3]) { + sum[0] = Sum3WLo<0>(a); + sum[1] = Sum3WHi(a); + sum[2] = Sum3WLo<1>(a); + return sum; +} + +inline __m128i Sum5_16(const __m128i a[5]) { + const __m128i sum01 = _mm_add_epi16(a[0], a[1]); + const __m128i sum23 = _mm_add_epi16(a[2], a[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, a[4]); +} + +inline __m128i Sum5_32(const __m128i a[5]) { + const __m128i sum01 = _mm_add_epi32(a[0], a[1]); + const __m128i sum23 = _mm_add_epi32(a[2], a[3]); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, a[4]); +} + +inline __m128i Sum5WLo_16(const __m128i a[5]) { + const __m128i sum01 = VaddlLo8(a[0], a[1]); + const __m128i sum23 = VaddlLo8(a[2], a[3]); + const __m128i sum = 
_mm_add_epi16(sum01, sum23); + return VaddwLo8(sum, a[4]); +} + +inline __m128i Sum5WHi_16(const __m128i a[5]) { + const __m128i sum01 = VaddlHi8(a[0], a[1]); + const __m128i sum23 = VaddlHi8(a[2], a[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return VaddwHi8(sum, a[4]); +} + +inline __m128i Sum5WLo_32(const __m128i a[5]) { + const __m128i sum01 = VaddlLo16(a[0], a[1]); + const __m128i sum23 = VaddlLo16(a[2], a[3]); + const __m128i sum0123 = _mm_add_epi32(sum01, sum23); + return VaddwLo16(sum0123, a[4]); +} + +inline __m128i Sum5WHi_32(const __m128i a[5]) { + const __m128i sum01 = VaddlHi16(a[0], a[1]); + const __m128i sum23 = VaddlHi16(a[2], a[3]); + const __m128i sum0123 = _mm_add_epi32(sum01, sum23); + return VaddwHi16(sum0123, a[4]); +} + +inline __m128i* Sum5W_16D(const __m128i a[5], __m128i sum[2]) { + sum[0] = Sum5WLo_16(a); + sum[1] = Sum5WHi_16(a); + return sum; +} + +inline __m128i* Sum5W_32x2(const __m128i a[5], __m128i sum[2]) { + sum[0] = Sum5WLo_32(a); + sum[1] = Sum5WHi_32(a); + return sum; +} + +template <int index> +inline __m128i Sum5WLo(const __m128i a[5][2]) { + __m128i b[5]; + b[0] = a[0][index]; + b[1] = a[1][index]; + b[2] = a[2][index]; + b[3] = a[3][index]; + b[4] = a[4][index]; + return Sum5WLo_32(b); +} + +inline __m128i Sum5WHi(const __m128i a[5][2]) { + __m128i b[5]; + b[0] = a[0][0]; + b[1] = a[1][0]; + b[2] = a[2][0]; + b[3] = a[3][0]; + b[4] = a[4][0]; + return Sum5WHi_32(b); +} + +inline __m128i* Sum5W_32x3(const __m128i a[5][2], __m128i sum[3]) { + sum[0] = Sum5WLo<0>(a); + sum[1] = Sum5WHi(a); + sum[2] = Sum5WLo<1>(a); + return sum; +} + +inline __m128i Sum3Horizontal(const __m128i a) { + const auto left = a; + const auto middle = _mm_srli_si128(a, 2); + const auto right = _mm_srli_si128(a, 4); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3Horizontal_16(const __m128i a[2]) { + const auto left = a[0]; + const auto middle = _mm_alignr_epi8(a[1], a[0], 2); + const auto right = _mm_alignr_epi8(a[1], a[0], 4); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3Horizontal_32(const __m128i a[2]) { + const auto left = a[0]; + const auto middle = _mm_alignr_epi8(a[1], a[0], 4); + const auto right = _mm_alignr_epi8(a[1], a[0], 8); + return Sum3_32(left, middle, right); +} + +inline __m128i* Sum3Horizontal_32x2(const __m128i a[3], __m128i sum[2]) { + { + const auto left = a[0]; + const auto middle = _mm_alignr_epi8(a[1], a[0], 4); + const auto right = _mm_alignr_epi8(a[1], a[0], 8); + sum[0] = Sum3_32(left, middle, right); + } + { + const auto left = a[1]; + const auto middle = _mm_alignr_epi8(a[2], a[1], 4); + const auto right = _mm_alignr_epi8(a[2], a[1], 8); + sum[1] = Sum3_32(left, middle, right); + } + return sum; +} + +inline __m128i Sum3HorizontalOffset1(const __m128i a) { + const auto left = _mm_srli_si128(a, 2); + const auto middle = _mm_srli_si128(a, 4); + const auto right = _mm_srli_si128(a, 6); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3HorizontalOffset1_16(const __m128i a[2]) { + const auto left = _mm_alignr_epi8(a[1], a[0], 2); + const auto middle = _mm_alignr_epi8(a[1], a[0], 4); + const auto right = _mm_alignr_epi8(a[1], a[0], 6); + return Sum3_16(left, middle, right); +} + +inline __m128i Sum3HorizontalOffset1_32(const __m128i a[2]) { + const auto left = _mm_alignr_epi8(a[1], a[0], 4); + const auto middle = _mm_alignr_epi8(a[1], a[0], 8); + const auto right = _mm_alignr_epi8(a[1], a[0], 12); + return Sum3_32(left, middle, right); +} + +inline void Sum3HorizontalOffset1_32x2(const 
__m128i a[3], __m128i sum[2]) { + sum[0] = Sum3HorizontalOffset1_32(a + 0); + sum[1] = Sum3HorizontalOffset1_32(a + 1); +} + +inline __m128i Sum5Horizontal(const __m128i a) { + __m128i s[5]; + s[0] = a; + s[1] = _mm_srli_si128(a, 2); + s[2] = _mm_srli_si128(a, 4); + s[3] = _mm_srli_si128(a, 6); + s[4] = _mm_srli_si128(a, 8); + return Sum5_16(s); +} + +inline __m128i Sum5Horizontal_16(const __m128i a[2]) { + __m128i s[5]; + s[0] = a[0]; + s[1] = _mm_alignr_epi8(a[1], a[0], 2); + s[2] = _mm_alignr_epi8(a[1], a[0], 4); + s[3] = _mm_alignr_epi8(a[1], a[0], 6); + s[4] = _mm_alignr_epi8(a[1], a[0], 8); + return Sum5_16(s); +} + +inline __m128i Sum5Horizontal_32(const __m128i a[2]) { + __m128i s[5]; + s[0] = a[0]; + s[1] = _mm_alignr_epi8(a[1], a[0], 4); + s[2] = _mm_alignr_epi8(a[1], a[0], 8); + s[3] = _mm_alignr_epi8(a[1], a[0], 12); + s[4] = a[1]; + return Sum5_32(s); +} + +inline __m128i* Sum5Horizontal_32x2(const __m128i a[3], __m128i sum[2]) { + __m128i s[5]; + s[0] = a[0]; + s[1] = _mm_alignr_epi8(a[1], a[0], 4); + s[2] = _mm_alignr_epi8(a[1], a[0], 8); + s[3] = _mm_alignr_epi8(a[1], a[0], 12); + s[4] = a[1]; + sum[0] = Sum5_32(s); + s[0] = a[1]; + s[1] = _mm_alignr_epi8(a[2], a[1], 4); + s[2] = _mm_alignr_epi8(a[2], a[1], 8); + s[3] = _mm_alignr_epi8(a[2], a[1], 12); + s[4] = a[2]; + sum[1] = Sum5_32(s); + return sum; +} + +template <int size, int offset> +inline void BoxFilterPreProcess4(const __m128i* const row, + const __m128i* const row_sq, const uint32_t s, + uint16_t* const dst) { + static_assert(offset == 0 || offset == 1, ""); + // Number of elements in the box being summed. + constexpr uint32_t n = size * size; + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + __m128i sum, sum_sq; + if (size == 3) { + __m128i temp32[2]; + if (offset == 0) { + sum = Sum3Horizontal(Sum3WLo_16(row)); + sum_sq = Sum3Horizontal_32(Sum3W(row_sq, temp32)); + } else { + sum = Sum3HorizontalOffset1(Sum3WLo_16(row)); + sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq, temp32)); + } + } + if (size == 5) { + __m128i temp[2]; + sum = Sum5Horizontal(Sum5WLo_16(row)); + sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq, temp)); + } + const __m128i sum_32 = _mm_unpacklo_epi16(sum, _mm_setzero_si128()); + const __m128i z0 = CalculateSgrMA2<n>(sum_sq, sum_32, s); + const __m128i z1 = _mm_packus_epi32(z0, z0); + const __m128i z = _mm_min_epu16(z1, _mm_set1_epi16(255)); + __m128i sgr_ma2 = _mm_setzero_si128(); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<0>(z)], 4); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<1>(z)], 5); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<2>(z)], 6); + sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<3>(z)], 7); + const __m128i b2 = CalculateIntermediate4(sgr_ma2, sum_32, one_over_n); + const __m128i sgr_ma2_b2 = _mm_unpacklo_epi64(sgr_ma2, b2); + StoreAligned16(dst, sgr_ma2_b2); +} + +template <int size, int offset> +inline void BoxFilterPreProcess8(const __m128i* const row, + const __m128i row_sq[][2], const uint32_t s, + __m128i* const sgr_ma2, __m128i* const b2, + uint16_t* const dst) { + // Number of elements in the box being summed. 
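  // (Editor's gloss, not part of the patch.) With kSgrProjReciprocalBits == 12,
  // the constexprs just below evaluate to one_over_n == (4096 + 12) / 25 == 164
  // for the 5x5 pass and (4096 + 4) / 9 == 455 for the 3x3 pass, the constants
  // quoted in the CalculateIntermediate4/8 comments above.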
+ constexpr uint32_t n = size * size; + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + __m128i sum, sum_sq[2]; + if (size == 3) { + __m128i temp16[2], temp32[3]; + if (offset == 0) { + sum = Sum3Horizontal_16(Sum3W_16x2(row, temp16)); + Sum3Horizontal_32x2(Sum3W(row_sq, temp32), sum_sq); + } else /* if (offset == 1) */ { + sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row, temp16)); + Sum3HorizontalOffset1_32x2(Sum3W(row_sq, temp32), sum_sq); + } + } + if (size == 5) { + __m128i temp16[2], temp32[3]; + sum = Sum5Horizontal_16(Sum5W_16D(row, temp16)); + Sum5Horizontal_32x2(Sum5W_32x3(row_sq, temp32), sum_sq); + } + const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128()); + const __m128i z0 = CalculateSgrMA2<n>(sum_sq[0], sum_lo, s); + const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128()); + const __m128i z1 = CalculateSgrMA2<n>(sum_sq[1], sum_hi, s); + const __m128i z01 = _mm_packus_epi32(z0, z1); + const __m128i z = _mm_min_epu16(z01, _mm_set1_epi16(255)); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<0>(z)], 8); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<1>(z)], 9); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<2>(z)], 10); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<3>(z)], 11); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<4>(z)], 12); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<5>(z)], 13); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<6>(z)], 14); + *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<7>(z)], 15); + *b2 = CalculateIntermediate8(*sgr_ma2, sum, one_over_n); + const __m128i sgr_ma2_b2 = _mm_unpackhi_epi64(*sgr_ma2, *b2); + StoreAligned16(dst, sgr_ma2_b2); +} + +inline void Prepare3_8(const __m128i a, __m128i* const left, + __m128i* const middle, __m128i* const right) { + *left = _mm_srli_si128(a, 4); + *middle = _mm_srli_si128(a, 5); + *right = _mm_srli_si128(a, 6); +} + +inline void Prepare3_16(const __m128i a[2], __m128i* const left, + __m128i* const middle, __m128i* const right) { + *left = _mm_alignr_epi8(a[1], a[0], 8); + *middle = _mm_alignr_epi8(a[1], a[0], 10); + *right = _mm_alignr_epi8(a[1], a[0], 12); +} + +inline __m128i Sum343(const __m128i a) { + __m128i left, middle, right; + Prepare3_8(a, &left, &middle, &right); + const auto sum = Sum3W_16(left, middle, right); + const auto sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, middle); +} + +inline void Sum343_444(const __m128i a, __m128i* const sum343, + __m128i* const sum444) { + __m128i left, middle, right; + Prepare3_8(a, &left, &middle, &right); + const auto sum = Sum3W_16(left, middle, right); + const auto sum3 = Sum3_16(sum, sum, sum); + *sum343 = VaddwLo8(sum3, middle); + *sum444 = _mm_slli_epi16(sum, 2); +} + +inline __m128i* Sum343W(const __m128i a[2], __m128i d[2]) { + __m128i left, middle, right; + Prepare3_16(a, &left, &middle, &right); + d[0] = Sum3WLo_32(left, middle, right); + d[1] = Sum3WHi_32(left, middle, right); + d[0] = Sum3_32(d[0], d[0], d[0]); + d[1] = Sum3_32(d[1], d[1], d[1]); + d[0] = VaddwLo16(d[0], middle); + d[1] = VaddwHi16(d[1], middle); + return d; +} + +inline void Sum343_444W(const __m128i a[2], __m128i sum343[2], + __m128i sum444[2]) { + __m128i left, middle, right; + Prepare3_16(a, &left, &middle, &right); + sum444[0] = Sum3WLo_32(left, middle, right); + sum444[1] = Sum3WHi_32(left, middle, right); + sum343[0] = Sum3_32(sum444[0], sum444[0], sum444[0]); + sum343[1] = 
Sum3_32(sum444[1], sum444[1], sum444[1]); + sum343[0] = VaddwLo16(sum343[0], middle); + sum343[1] = VaddwHi16(sum343[1], middle); + sum444[0] = _mm_slli_epi32(sum444[0], 2); + sum444[1] = _mm_slli_epi32(sum444[1], 2); +} + +inline __m128i Sum565(const __m128i a) { + __m128i left, middle, right; + Prepare3_8(a, &left, &middle, &right); + const auto sum = Sum3W_16(left, middle, right); + const auto sum4 = _mm_slli_epi16(sum, 2); + const auto sum5 = _mm_add_epi16(sum4, sum); + return VaddwLo8(sum5, middle); +} + +inline __m128i Sum565W(const __m128i a) { + const auto left = a; + const auto middle = _mm_srli_si128(a, 2); + const auto right = _mm_srli_si128(a, 4); + const auto sum = Sum3WLo_32(left, middle, right); + const auto sum4 = _mm_slli_epi32(sum, 2); + const auto sum5 = _mm_add_epi32(sum4, sum); + return VaddwLo16(sum5, middle); +} + +// RightShiftWithRounding( +// (a * src_ptr[x] + b), kSgrProjSgrBits + shift - kSgrProjRestoreBits); +template <int shift> +inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i a, + const __m128i b[2]) { + const __m128i src_u16 = _mm_unpacklo_epi8(src, _mm_setzero_si128()); + // a: 256 * 32 = 8192 (14 bits) + // b: 65088 * 32 = 2082816 (21 bits) + const __m128i axsrc_lo = VmullLo16(a, src_u16); + const __m128i axsrc_hi = VmullHi16(a, src_u16); + // v: 8192 * 255 + 2082816 = 4171876 (22 bits) + const __m128i v_lo = _mm_add_epi32(axsrc_lo, b[0]); + const __m128i v_hi = _mm_add_epi32(axsrc_hi, b[1]); + + // kSgrProjSgrBits = 8 + // kSgrProjRestoreBits = 4 + // shift = 4 or 5 + // v >> 8 or 9 + // 22 bits >> 8 = 14 bits + const __m128i dst_lo = + VrshrN32(v_lo, kSgrProjSgrBits + shift - kSgrProjRestoreBits); + const __m128i dst_hi = + VrshrN32(v_hi, kSgrProjSgrBits + shift - kSgrProjRestoreBits); + return _mm_packus_epi32(dst_lo, dst_hi); // 14 bits +} + +inline __m128i BoxFilterPass1(const __m128i src_u8, const __m128i a2, + const __m128i b2[2], __m128i sum565_a[2], + __m128i sum565_b[2][2]) { + __m128i b_v[2]; + sum565_a[1] = Sum565(a2); + sum565_a[1] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[1]); + sum565_b[1][0] = Sum565W(_mm_alignr_epi8(b2[1], b2[0], 8)); + sum565_b[1][1] = Sum565W(b2[1]); + + __m128i a_v = _mm_add_epi16(sum565_a[0], sum565_a[1]); + b_v[0] = _mm_add_epi32(sum565_b[0][0], sum565_b[1][0]); + b_v[1] = _mm_add_epi32(sum565_b[0][1], sum565_b[1][1]); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits +} + +inline __m128i BoxFilterPass2(const __m128i src_u8, const __m128i a2, + const __m128i b2[2], __m128i sum343_a[4], + __m128i sum444_a[3], __m128i sum343_b[4][2], + __m128i sum444_b[3][2]) { + __m128i b_v[2]; + Sum343_444(a2, &sum343_a[2], &sum444_a[1]); + sum343_a[2] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[2]); + sum444_a[1] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[1]); + __m128i a_v = Sum3_16(sum343_a[0], sum444_a[0], sum343_a[2]); + Sum343_444W(b2, sum343_b[2], sum444_b[1]); + b_v[0] = Sum3_32(sum343_b[0][0], sum444_b[0][0], sum343_b[2][0]); + b_v[1] = Sum3_32(sum343_b[0][1], sum444_b[0][1], sum343_b[2][1]); + return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits +} + +inline void SelfGuidedDoubleMultiplier( + const __m128i src, const __m128i box_filter_process_output[2], + const __m128i w0, const __m128i w1, const __m128i w2, uint8_t* const dst) { + // |wN| values are signed. |src| values can be treated as int16_t. 
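  // (Editor's gloss, not part of the patch.) Scalar reference for the vector
  // code below, written with this function's parameter names:
  //   u = src << kSgrProjRestoreBits;
  //   v = w1 * u + w0 * box_filter_process_output[0]
  //             + w2 * box_filter_process_output[1];
  //   dst = Clip3(RightShiftWithRounding(
  //             v, kSgrProjRestoreBits + kSgrProjPrecisionBits), 0, 255);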
+ const __m128i u = VshllN8(src, kSgrProjRestoreBits); + __m128i v_lo = VmulwLo16(w1, u); + v_lo = VmlawLo16(v_lo, w0, box_filter_process_output[0]); + v_lo = VmlawLo16(v_lo, w2, box_filter_process_output[1]); + __m128i v_hi = VmulwHi16(w1, u); + v_hi = VmlawHi16(v_hi, w0, box_filter_process_output[0]); + v_hi = VmlawHi16(v_hi, w2, box_filter_process_output[1]); + // |s| is saturated to uint8_t. + const __m128i s_lo = + VrshrNS32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s_hi = + VrshrNS32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s = _mm_packs_epi32(s_lo, s_hi); + StoreLo8(dst, _mm_packus_epi16(s, s)); +} + +inline void SelfGuidedSingleMultiplier(const __m128i src, + const __m128i box_filter_process_output, + const int16_t w0, const int16_t w1, + uint8_t* const dst) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + const __m128i u = VshllN8(src, kSgrProjRestoreBits); + // u * w1 + u * wN == u * (w1 + wN) + __m128i v_lo = VmullNLo8(u, w1); + v_lo = VmlalNLo16(v_lo, box_filter_process_output, w0); + __m128i v_hi = VmullNHi8(u, w1); + v_hi = VmlalNHi16(v_hi, box_filter_process_output, w0); + const __m128i s_lo = + VrshrNS32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s_hi = + VrshrNS32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i s = _mm_packs_epi32(s_lo, s_hi); + StoreLo8(dst, _mm_packus_epi16(s, s)); +} + +inline void BoxFilterProcess(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint16_t s[2], uint16_t* const temp, + uint8_t* const dst, const ptrdiff_t dst_stride) { + // We have combined PreProcess and Process for the first pass by storing + // intermediate values in the |a2| region. The values stored are one vertical + // column of interleaved |a2| and |b2| values and consume 8 * |height| values. + // This is |height| and not |height| * 2 because PreProcess only generates + // output for every other row. When processing the next column we write the + // new scratch values right after reading the previously saved ones. + + // The PreProcess phase calculates a 5x5 box sum for every other row + // + // PreProcess and Process have been combined into the same step. We need 12 + // input values to generate 8 output values for PreProcess: + // 0 1 2 3 4 5 6 7 8 9 10 11 + // 2 = 0 + 1 + 2 + 3 + 4 + // 3 = 1 + 2 + 3 + 4 + 5 + // 4 = 2 + 3 + 4 + 5 + 6 + // 5 = 3 + 4 + 5 + 6 + 7 + // 6 = 4 + 5 + 6 + 7 + 8 + // 7 = 5 + 6 + 7 + 8 + 9 + // 8 = 6 + 7 + 8 + 9 + 10 + // 9 = 7 + 8 + 9 + 10 + 11 + // + // and then we need 10 input values to generate 8 output values for Process: + // 0 1 2 3 4 5 6 7 8 9 + // 1 = 0 + 1 + 2 + // 2 = 1 + 2 + 3 + // 3 = 2 + 3 + 4 + // 4 = 3 + 4 + 5 + // 5 = 4 + 5 + 6 + // 6 = 5 + 6 + 7 + // 7 = 6 + 7 + 8 + // 8 = 7 + 8 + 9 + // + // To avoid re-calculating PreProcess values over and over again we will do a + // single column of 8 output values and store the second half of them + // interleaved in |temp|. The first half is not stored, since it is used + // immediately and becomes useless for the next column. Next we will start the + // second column. When 2 rows have been calculated we can calculate Process + // and output the results. + + // Calculate and store a single column. Scope so we can re-use the variable + // names for the next step. 
+ uint16_t* ab_ptr = temp; + + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. + { + const uint8_t* column = src_pre_process; + __m128i row[5], row_sq[5]; + row[0] = row[1] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[2] = LoadLo8Msan(column, 2 - width); + + row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]); + row_sq[2] = VmullLo8(row[2], row[2]); + + int y = (height + 2) >> 1; do { - const __m128i v_a = - HorizontalAddVerticalSumsRadius1(&vertical_sum_of_squares[x - 1]); - const __m128i v_b = - HorizontalAddVerticalSumsRadius1(&vertical_sums[x - 1]); - // ----------------------- - // calc p, z, a2 - // ----------------------- - const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0); - const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0); - const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0); - const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b); - const __m128i v_axn = _mm_mullo_epi32(v_a, v_n); - const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd); - const __m128i v_z = _mm_min_epi32( - v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s), - kSgrProjScaleBits)); - const __m128i v_a2 = _mm_set_epi32(kXByXPlus1[_mm_extract_epi32(v_z, 3)], - kXByXPlus1[_mm_extract_epi32(v_z, 2)], - kXByXPlus1[_mm_extract_epi32(v_z, 1)], - kXByXPlus1[_mm_extract_epi32(v_z, 0)]); - // ----------------------- - // calc b2 and store - // ----------------------- - const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2); - const __m128i v_b2 = - _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n)); - StoreUnaligned16(&intermediate_result[0][x], v_a2); - StoreUnaligned16( - &intermediate_result[1][x], - RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits)); - x += 4; - } while (x <= width); - intermediate_result[0] += array_stride; - intermediate_result[1] += array_stride; - } while (++y <= height); -} - -void BoxFilterPreProcessRadius2_SSE4_1( - const uint8_t* const src, ptrdiff_t stride, int width, int height, - uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride, - uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) { - assert(s != 0); - const uint32_t n = 25; - const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; - const __m128i v_one_over_n = - _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0); - const __m128i v_sgrbits = - _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0); - - // Calculate intermediate results, including one-pixel border, for example, - // if unit size is 64x64, we calculate 66x66 pixels. 
- int y = -1; + column += src_stride; + row[3] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[4] = LoadLo8Msan(column, 2 - width); + + row_sq[3] = VmullLo8(row[3], row[3]); + row_sq[4] = VmullLo8(row[4], row[4]); + + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + ab_ptr += 24; + } while (--y != 0); + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = LoadLo8Msan(column, 2 - width); + row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0); + BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8); + } + } + + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + const __m128i w0_v = _mm_set1_epi32(w0); + const __m128i w1_v = _mm_set1_epi32(w1); + const __m128i w2_v = _mm_set1_epi32(w2); + int x = 0; do { - // Calculate the box vertical sums for each x position. - const uint8_t* top_left = &src[(y - 2) * stride - 3]; - int vsx = -3; - do { - const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left)); - const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride)); - const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2)); - const __m128i v_box3 = _mm_cvtepu8_epi32(Load4(top_left + stride * 3)); - const __m128i v_box4 = _mm_cvtepu8_epi32(Load4(top_left + stride * 4)); - const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0); - const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1); - const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2); - const __m128i v_sqr3 = _mm_mullo_epi32(v_box3, v_box3); - const __m128i v_sqr4 = _mm_mullo_epi32(v_box4, v_box4); - const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1); - const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2); - const __m128i v_a0123 = _mm_add_epi32(v_a012, v_sqr3); - const __m128i v_a01234 = _mm_add_epi32(v_a0123, v_sqr4); - const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1); - const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2); - const __m128i v_b0123 = _mm_add_epi32(v_b012, v_box3); - const __m128i v_b01234 = _mm_add_epi32(v_b0123, v_box4); - StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a01234); - StoreUnaligned16(&vertical_sums[vsx], v_b01234); - top_left += 4; - vsx += 4; - } while (vsx <= width + 2); - - int x = -1; + // |src_pre_process| is X but we already processed the first column of 4 + // values so we want to start at Y and increment from there. + // X s s s Y s s + // s s s s s s s + // s s i i i i i + // s s i o o o o + // s s i o o o o + + // Seed the loop with one line of output. Then, inside the loop, for each + // iteration we can output one even row and one odd row and carry the new + // line to the next iteration. In the diagram below 'i' values are + // intermediary values from the first step and '-' values are empty. 
+ // iiii + // ---- > even row + // iiii - odd row + // ---- > even row + // iiii + __m128i a2[2], b2[2][2], sum565_a[2], sum343_a[4], sum444_a[3]; + __m128i sum565_b[2][2], sum343_b[4][2], sum444_b[3][2]; + ab_ptr = temp; + a2[0] = b2[0][0] = LoadAligned16(ab_ptr); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8); + + const uint8_t* column = src_pre_process + x + 4; + __m128i row[5], row_sq[5][2]; + row[0] = row[1] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]); + row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]); + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1], + ab_ptr + 8); + + // Pass 1 Process. These are the only values we need to propagate between + // rows. + sum565_a[0] = Sum565(a2[0]); + sum565_a[0] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[0]); + sum565_b[0][0] = Sum565W(_mm_alignr_epi8(b2[0][1], b2[0][0], 8)); + sum565_b[0][1] = Sum565W(b2[0][1]); + + sum343_a[0] = Sum343(a2[1]); + sum343_a[0] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[0]); + Sum343W(b2[1], sum343_b[0]); + + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 16); + + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1], &b2[1][1], + ab_ptr + 16); + + Sum343_444(a2[1], &sum343_a[1], &sum444_a[0]); + sum343_a[1] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[1]); + sum444_a[0] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[0]); + Sum343_444W(b2[1], sum343_b[1], sum444_b[0]); + + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + + // Calculate one output line. Add in the line from the previous pass and + // output one even row. Sum the new line and output the odd row. Carry the + // new row into the next pass. 
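    // (Editor's gloss, not part of the patch.) Each iteration below advances
    // |ab_ptr| by 24 uint16_t values: one 8-value a2/b2 block from the 5x5 pass
    // plus two from the 3x3 pass (offsets +0, +8 and +16), matching the three
    // per-row-pair stores made by the first-column loop above.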
+ for (int y = height >> 1; y != 0; --y) { + ab_ptr += 24; + a2[0] = b2[0][0] = LoadAligned16(ab_ptr); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1], + ab_ptr + 8); + + __m128i p[2]; + const __m128i src0 = LoadLo8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + const __m128i src1 = LoadLo8(src_ptr); + p[0] = CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 16); + BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1], &b2[1][1], + ab_ptr + 16); + p[1] = BoxFilterPass2(src1, a2[1], b2[1], sum343_a + 1, sum444_a + 1, + sum343_b + 1, sum444_b + 1); + SelfGuidedDoubleMultiplier(src1, p, w0_v, w1_v, w2_v, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + sum565_a[0] = sum565_a[1]; + sum565_b[0][0] = sum565_b[1][0], sum565_b[0][1] = sum565_b[1][1]; + sum343_a[0] = sum343_a[2]; + sum343_a[1] = sum343_a[3]; + sum444_a[0] = sum444_a[2]; + sum343_b[0][0] = sum343_b[2][0], sum343_b[0][1] = sum343_b[2][1]; + sum343_b[1][0] = sum343_b[3][0], sum343_b[1][1] = sum343_b[3][1]; + sum444_b[0][0] = sum444_b[2][0], sum444_b[0][1] = sum444_b[2][1]; + } + if ((height & 1) != 0) { + ab_ptr += 24; + a2[0] = b2[0][0] = LoadAligned16(ab_ptr); + a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr); + BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1], + ab_ptr + 8); + + __m128i p[2]; + const __m128i src0 = LoadLo8(src_ptr); + p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b); + p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b, + sum444_b); + SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr); + } + x += 8; + } while (x < width); +} + +inline void BoxFilterProcessPass1(const uint8_t* const src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { + // We have combined PreProcess and Process for the first pass by storing + // intermediate values in the |a2| region. 
The values stored are one vertical + // column of interleaved |a2| and |b2| values and consume 8 * |height| values. + // This is |height| and not |height| * 2 because PreProcess only generates + // output for every other row. When processing the next column we write the + // new scratch values right after reading the previously saved ones. + + // The PreProcess phase calculates a 5x5 box sum for every other row + // + // PreProcess and Process have been combined into the same step. We need 12 + // input values to generate 8 output values for PreProcess: + // 0 1 2 3 4 5 6 7 8 9 10 11 + // 2 = 0 + 1 + 2 + 3 + 4 + // 3 = 1 + 2 + 3 + 4 + 5 + // 4 = 2 + 3 + 4 + 5 + 6 + // 5 = 3 + 4 + 5 + 6 + 7 + // 6 = 4 + 5 + 6 + 7 + 8 + // 7 = 5 + 6 + 7 + 8 + 9 + // 8 = 6 + 7 + 8 + 9 + 10 + // 9 = 7 + 8 + 9 + 10 + 11 + // + // and then we need 10 input values to generate 8 output values for Process: + // 0 1 2 3 4 5 6 7 8 9 + // 1 = 0 + 1 + 2 + // 2 = 1 + 2 + 3 + // 3 = 2 + 3 + 4 + // 4 = 3 + 4 + 5 + // 5 = 4 + 5 + 6 + // 6 = 5 + 6 + 7 + // 7 = 6 + 7 + 8 + // 8 = 7 + 8 + 9 + // + // To avoid re-calculating PreProcess values over and over again we will do a + // single column of 8 output values and store the second half of them + // interleaved in |temp|. The first half is not stored, since it is used + // immediately and becomes useless for the next column. Next we will start the + // second column. When 2 rows have been calculated we can calculate Process + // and output the results. + + // Calculate and store a single column. Scope so we can re-use the variable + // names for the next step. + uint16_t* ab_ptr = temp; + + const uint8_t* const src_pre_process = src - 2 * src_stride - 3; + // Calculate intermediate results, including two-pixel border, for example, if + // unit size is 64x64, we calculate 68x68 pixels. 
+ { + const uint8_t* column = src_pre_process; + __m128i row[5], row_sq[5]; + row[0] = row[1] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[2] = LoadLo8Msan(column, 2 - width); + + row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]); + row_sq[2] = VmullLo8(row[2], row[2]); + + int y = (height + 2) >> 1; do { - const __m128i v_a = - HorizontalAddVerticalSumsRadius2(&vertical_sum_of_squares[x - 2]); - const __m128i v_b = - HorizontalAddVerticalSumsRadius2(&vertical_sums[x - 2]); - // ----------------------- - // calc p, z, a2 - // ----------------------- - const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0); - const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0); - const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0); - const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b); - const __m128i v_axn = _mm_mullo_epi32(v_a, v_n); - const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd); - const __m128i v_z = _mm_min_epi32( - v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s), - kSgrProjScaleBits)); - const __m128i v_a2 = _mm_set_epi32(kXByXPlus1[_mm_extract_epi32(v_z, 3)], - kXByXPlus1[_mm_extract_epi32(v_z, 2)], - kXByXPlus1[_mm_extract_epi32(v_z, 1)], - kXByXPlus1[_mm_extract_epi32(v_z, 0)]); - // ----------------------- - // calc b2 and store - // ----------------------- - const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2); - const __m128i v_b2 = - _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n)); - StoreUnaligned16(&intermediate_result[0][x], v_a2); - StoreUnaligned16( - &intermediate_result[1][x], - RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits)); - x += 4; - } while (x <= width); - intermediate_result[0] += 2 * array_stride; - intermediate_result[1] += 2 * array_stride; - y += 2; - } while (y <= height); -} - -void BoxFilterPreProcess_SSE4_1(const RestorationUnitInfo& restoration_info, - const uint8_t* const src, ptrdiff_t stride, - int width, int height, int pass, - RestorationBuffer* const buffer) { - uint32_t vertical_sums_buf[kRestorationProcessingUnitSize + - 2 * kRestorationBorder + kRestorationPadding]; - uint32_t vertical_sum_of_squares_buf[kRestorationProcessingUnitSize + - 2 * kRestorationBorder + - kRestorationPadding]; - uint32_t* vertical_sums = &vertical_sums_buf[4]; - uint32_t* vertical_sum_of_squares = &vertical_sum_of_squares_buf[4]; - const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride; - // The size of the intermediate result buffer is the size of the filter area - // plus horizontal (3) and vertical (3) padding. The processing start point - // is the filter area start point -1 row and -1 column. Therefore we need to - // set offset and use the intermediate_result as the start point for - // processing. 
- const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * array_stride + kRestorationBorder; - uint32_t* intermediate_result[2] = { - buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset - - array_stride, - buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset - - array_stride}; - const int sgr_proj_index = restoration_info.sgr_proj_info.index; - if (pass == 0) { - assert(kSgrProjParams[sgr_proj_index][0] == 2); - BoxFilterPreProcessRadius2_SSE4_1(src, stride, width, height, - kSgrScaleParameter[sgr_proj_index][0], - intermediate_result, array_stride, - vertical_sums, vertical_sum_of_squares); - } else { - assert(kSgrProjParams[sgr_proj_index][2] == 1); - BoxFilterPreProcessRadius1_SSE4_1(src, stride, width, height, - kSgrScaleParameter[sgr_proj_index][1], - intermediate_result, array_stride, - vertical_sums, vertical_sum_of_squares); + column += src_stride; + row[3] = LoadLo8Msan(column, 2 - width); + column += src_stride; + row[4] = LoadLo8Msan(column, 2 - width); + + row_sq[3] = VmullLo8(row[3], row[3]); + row_sq[4] = VmullLo8(row[4], row[4]); + + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0] = row_sq[2]; + row_sq[1] = row_sq[3]; + row_sq[2] = row_sq[4]; + ab_ptr += 8; + } while (--y != 0); + if ((height & 1) != 0) { + column += src_stride; + row[3] = row[4] = LoadLo8Msan(column, 2 - width); + row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]); + BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr); + } } -} -inline __m128i Sum565Row(const __m128i v_DBCA, const __m128i v_XXFE) { - __m128i v_sum = v_DBCA; - const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4); - v_sum = _mm_add_epi32(v_sum, v_EDCB); - const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8); - v_sum = _mm_add_epi32(v_sum, v_FEDC); - // D C B A x4 - // + E D C B x4 - // + F E D C x4 - v_sum = _mm_slli_epi32(v_sum, 2); - // + D C B A - v_sum = _mm_add_epi32(v_sum, v_DBCA); // 5 - // + E D C B x2 - v_sum = _mm_add_epi32(v_sum, _mm_slli_epi32(v_EDCB, 1)); // 6 - // + F E D C - return _mm_add_epi32(v_sum, v_FEDC); // 5 -} - -inline __m128i Process3x3Block_565_Odd(const uint32_t* src, ptrdiff_t stride) { - // 0 0 0 - // 5 6 5 - // 0 0 0 - const uint32_t* top_left = src - 1; - const __m128i v_src1_lo = LoadUnaligned16(top_left + stride); - const __m128i v_src1_hi = LoadLo8(top_left + stride + 4); - return Sum565Row(v_src1_lo, v_src1_hi); -} - -inline __m128i Process3x3Block_565_Even(const uint32_t* src, ptrdiff_t stride) { - // 5 6 5 - // 0 0 0 - // 5 6 5 - const uint32_t* top_left = src - 1; - const __m128i v_src0_lo = LoadUnaligned16(top_left); - const __m128i v_src0_hi = LoadLo8(top_left + 4); - const __m128i v_src2_lo = LoadUnaligned16(top_left + stride * 2); - const __m128i v_src2_hi = LoadLo8(top_left + stride * 2 + 4); - const __m128i v_a0 = Sum565Row(v_src0_lo, v_src0_hi); - const __m128i v_a2 = Sum565Row(v_src2_lo, v_src2_hi); - return _mm_add_epi32(v_a0, v_a2); -} - -inline __m128i Sum343Row(const __m128i v_DBCA, const __m128i v_XXFE) { - __m128i v_sum = v_DBCA; - const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4); - v_sum = _mm_add_epi32(v_sum, v_EDCB); - const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8); - v_sum = _mm_add_epi32(v_sum, v_FEDC); - // D C B A x4 - // + E D C B x4 - // + F E D C x4 - v_sum = _mm_slli_epi32(v_sum, 2); // 4 - // - D C B A - v_sum = _mm_sub_epi32(v_sum, v_DBCA); // 3 - // - F E D C - return _mm_sub_epi32(v_sum, v_FEDC); // 3 -} - -inline __m128i 
Sum444Row(const __m128i v_DBCA, const __m128i v_XXFE) { - __m128i v_sum = v_DBCA; - const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4); - v_sum = _mm_add_epi32(v_sum, v_EDCB); - const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8); - v_sum = _mm_add_epi32(v_sum, v_FEDC); - // D C B A x4 - // + E D C B x4 - // + F E D C x4 - return _mm_slli_epi32(v_sum, 2); // 4 -} - -inline __m128i Process3x3Block_343(const uint32_t* src, ptrdiff_t stride) { - const uint32_t* top_left = src - 1; - const __m128i v_ir0_lo = LoadUnaligned16(top_left); - const __m128i v_ir0_hi = LoadLo8(top_left + 4); - const __m128i v_ir1_lo = LoadUnaligned16(top_left + stride); - const __m128i v_ir1_hi = LoadLo8(top_left + stride + 4); - const __m128i v_ir2_lo = LoadUnaligned16(top_left + stride * 2); - const __m128i v_ir2_hi = LoadLo8(top_left + stride * 2 + 4); - const __m128i v_a0 = Sum343Row(v_ir0_lo, v_ir0_hi); - const __m128i v_a1 = Sum444Row(v_ir1_lo, v_ir1_hi); - const __m128i v_a2 = Sum343Row(v_ir2_lo, v_ir2_hi); - return _mm_add_epi32(v_a0, _mm_add_epi32(v_a1, v_a2)); -} - -void BoxFilterProcess_SSE4_1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, ptrdiff_t stride, int width, - int height, RestorationBuffer* const buffer) { - const int sgr_proj_index = restoration_info.sgr_proj_info.index; - for (int pass = 0; pass < 2; ++pass) { - const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2]; - const uint8_t* src_ptr = src; - if (radius == 0) continue; - - BoxFilterPreProcess_SSE4_1(restoration_info, src_ptr, stride, width, height, - pass, buffer); - - int* filtered_output = buffer->box_filter_process_output[pass]; - const ptrdiff_t filtered_output_stride = - buffer->box_filter_process_output_stride; - const ptrdiff_t intermediate_stride = - buffer->box_filter_process_intermediate_stride; - // Set intermediate buffer start point to the actual start point of - // filtering. - const ptrdiff_t intermediate_buffer_offset = - kRestorationBorder * intermediate_stride + kRestorationBorder; - - if (pass == 0) { - int y = 0; - do { - const int shift = ((y & 1) != 0) ? 
4 : 5; - uint32_t* const array_start[2] = { - buffer->box_filter_process_intermediate[0] + - intermediate_buffer_offset + y * intermediate_stride, - buffer->box_filter_process_intermediate[1] + - intermediate_buffer_offset + y * intermediate_stride}; - uint32_t* intermediate_result2[2] = { - array_start[0] - intermediate_stride, - array_start[1] - intermediate_stride}; - if ((y & 1) == 0) { // even row - int x = 0; - do { - // 5 6 5 - // 0 0 0 - // 5 6 5 - const __m128i v_A = Process3x3Block_565_Even( - &intermediate_result2[0][x], intermediate_stride); - const __m128i v_B = Process3x3Block_565_Even( - &intermediate_result2[1][x], intermediate_stride); - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x)); - const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src); - const __m128i v_v = _mm_add_epi32(v_v0, v_B); - const __m128i v_filtered = RightShiftWithRounding_U32( - v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - - StoreUnaligned16(&filtered_output[x], v_filtered); - x += 4; - } while (x < width); - } else { - int x = 0; - do { - // 0 0 0 - // 5 6 5 - // 0 0 0 - const __m128i v_A = Process3x3Block_565_Odd( - &intermediate_result2[0][x], intermediate_stride); - const __m128i v_B = Process3x3Block_565_Odd( - &intermediate_result2[1][x], intermediate_stride); - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x)); - const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src); - const __m128i v_v = _mm_add_epi32(v_v0, v_B); - const __m128i v_filtered = RightShiftWithRounding_U32( - v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - - StoreUnaligned16(&filtered_output[x], v_filtered); - x += 4; - } while (x < width); - } - src_ptr += stride; - filtered_output += filtered_output_stride; - } while (++y < height); - } else { - int y = 0; - do { - const int shift = 5; - uint32_t* const array_start[2] = { - buffer->box_filter_process_intermediate[0] + - intermediate_buffer_offset + y * intermediate_stride, - buffer->box_filter_process_intermediate[1] + - intermediate_buffer_offset + y * intermediate_stride}; - uint32_t* intermediate_result2[2] = { - array_start[0] - intermediate_stride, - array_start[1] - intermediate_stride}; - int x = 0; - do { - const __m128i v_A = Process3x3Block_343(&intermediate_result2[0][x], - intermediate_stride); - const __m128i v_B = Process3x3Block_343(&intermediate_result2[1][x], - intermediate_stride); - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x)); - const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src); - const __m128i v_v = _mm_add_epi32(v_v0, v_B); - const __m128i v_filtered = RightShiftWithRounding_U32( - v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); - - StoreUnaligned16(&filtered_output[x], v_filtered); - x += 4; - } while (x < width); - src_ptr += stride; - filtered_output += filtered_output_stride; - } while (++y < height); + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0; + int x = 0; + do { + // |src_pre_process| is X but we already processed the first column of 4 + // values so we want to start at Y and increment from there. + // X s s s Y s s + // s s s s s s s + // s s i i i i i + // s s i o o o o + // s s i o o o o + + // Seed the loop with one line of output. Then, inside the loop, for each + // iteration we can output one even row and one odd row and carry the new + // line to the next iteration. In the diagram below 'i' values are + // intermediary values from the first step and '-' values are empty. 
+ // iiii + // ---- > even row + // iiii - odd row + // ---- > even row + // iiii + __m128i a2[2], b2[2], sum565_a[2], sum565_b[2][2]; + ab_ptr = temp; + a2[0] = b2[0] = LoadAligned16(ab_ptr); + + const uint8_t* column = src_pre_process + x + 4; + __m128i row[5], row_sq[5][2]; + row[0] = row[1] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]); + row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]); + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr); + + // Pass 1 Process. These are the only values we need to propagate between + // rows. + sum565_a[0] = Sum565(a2[0]); + sum565_a[0] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[0]); + sum565_b[0][0] = Sum565W(_mm_alignr_epi8(b2[1], b2[0], 8)); + sum565_b[0][1] = Sum565W(b2[1]); + + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + + // Calculate one output line. Add in the line from the previous pass and + // output one even row. Sum the new line and output the odd row. Carry the + // new row into the next pass. + for (int y = height >> 1; y != 0; --y) { + ab_ptr += 8; + a2[0] = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = LoadUnaligned16Msan(column, x + 14 - width); + column += src_stride; + row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = VmullHi8(row[3], row[3]); + row_sq[4][0] = VmullLo8(row[4], row[4]); + row_sq[4][1] = VmullHi8(row[4], row[4]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr); + + const __m128i src0 = LoadLo8(src_ptr); + const __m128i p0 = BoxFilterPass1(src0, a2[0], b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + const __m128i src1 = LoadLo8(src_ptr); + const __m128i p1 = + CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]); + SelfGuidedSingleMultiplier(src1, p1, w0, w1, dst_ptr); + src_ptr += src_stride; + dst_ptr += dst_stride; + + sum565_a[0] = sum565_a[1]; + sum565_b[0][0] = sum565_b[1][0], sum565_b[0][1] = sum565_b[1][1]; + } + if ((height & 1) != 0) { + ab_ptr += 8; + a2[0] = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[2]; + row[1] = row[3]; + row[2] = row[4]; + + row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1]; + row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1]; + row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1]; + + column += src_stride; + row[3] = row[4] = LoadUnaligned16Msan(column, x + 14 - width); + + row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]); + row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]); + + BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr); + + const __m128i src0 = LoadLo8(src_ptr); + const __m128i 
p0 = BoxFilterPass1(src0, a2[0], b2, sum565_a, sum565_b); + SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr); } + x += 8; + } while (x < width); +} + +inline void BoxFilterProcessPass2(const uint8_t* src, + const ptrdiff_t src_stride, + const RestorationUnitInfo& restoration_info, + const int width, const int height, + const uint32_t s, uint16_t* const temp, + uint8_t* const dst, + const ptrdiff_t dst_stride) { + uint16_t* ab_ptr = temp; + + // Calculate intermediate results, including one-pixel border, for example, if + // unit size is 64x64, we calculate 66x66 pixels. + // Because of the vectors this calculates start in blocks of 4 so we actually + // get 68 values. + const uint8_t* const src_top_left_corner = src - 2 * src_stride - 2; + { + const uint8_t* column = src_top_left_corner; + __m128i row[3], row_sq[3]; + row[0] = LoadLo8Msan(column, 4 - width); + column += src_stride; + row[1] = LoadLo8Msan(column, 4 - width); + row_sq[0] = VmullLo8(row[0], row[0]); + row_sq[1] = VmullLo8(row[1], row[1]); + + int y = height + 2; + do { + column += src_stride; + row[2] = LoadLo8Msan(column, 4 - width); + row_sq[2] = VmullLo8(row[2], row[2]); + + BoxFilterPreProcess4<3, 0>(row, row_sq, s, ab_ptr); + + row[0] = row[1]; + row[1] = row[2]; + + row_sq[0] = row_sq[1]; + row_sq[1] = row_sq[2]; + ab_ptr += 8; + } while (--y != 0); } + + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + int x = 0; + do { + ab_ptr = temp; + + __m128i a2, b2[2], sum343_a[3], sum444_a[2], sum343_b[3][2], sum444_b[2][2]; + a2 = b2[0] = LoadAligned16(ab_ptr); + + const uint8_t* column = src_top_left_corner + x + 4; + __m128i row[3], row_sq[3][2]; + row[0] = LoadUnaligned16Msan(column, x + 16 - width); + column += src_stride; + row[1] = LoadUnaligned16Msan(column, x + 16 - width); + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 16 - width); + + row_sq[0][0] = VmullLo8(row[0], row[0]); + row_sq[0][1] = VmullHi8(row[0], row[0]); + row_sq[1][0] = VmullLo8(row[1], row[1]); + row_sq[1][1] = VmullHi8(row[1], row[1]); + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr); + + sum343_a[0] = Sum343(a2); + sum343_a[0] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[0]); + Sum343W(b2, sum343_b[0]); + + ab_ptr += 8; + a2 = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[1]; + row[1] = row[2]; + + row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1]; + row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1]; + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 16 - width); + + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr); + + Sum343_444(a2, &sum343_a[1], &sum444_a[0]); + sum343_a[1] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[1]); + sum444_a[0] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[0]); + Sum343_444W(b2, sum343_b[1], sum444_b[0]); + + const uint8_t* src_ptr = src + x; + uint8_t* dst_ptr = dst + x; + int y = height; + do { + ab_ptr += 8; + a2 = b2[0] = LoadAligned16(ab_ptr); + + row[0] = row[1]; + row[1] = row[2]; + + row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1]; + row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1]; + column += src_stride; + row[2] = LoadUnaligned16Msan(column, x + 16 - width); 
+ + row_sq[2][0] = VmullLo8(row[2], row[2]); + row_sq[2][1] = VmullHi8(row[2], row[2]); + + BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr); + + const __m128i src_u8 = LoadLo8(src_ptr); + const __m128i p = BoxFilterPass2(src_u8, a2, b2, sum343_a, sum444_a, + sum343_b, sum444_b); + SelfGuidedSingleMultiplier(src_u8, p, w0, w1, dst_ptr); + sum343_a[0] = sum343_a[1]; + sum343_a[1] = sum343_a[2]; + sum444_a[0] = sum444_a[1]; + sum343_b[0][0] = sum343_b[1][0], sum343_b[0][1] = sum343_b[1][1]; + sum343_b[1][0] = sum343_b[2][0], sum343_b[1][1] = sum343_b[2][1]; + sum444_b[0][0] = sum444_b[1][0], sum444_b[0][1] = sum444_b[1][1]; + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--y != 0); + x += 8; + } while (x < width); } -void SelfGuidedFilter_SSE4_1(const void* source, void* dest, +// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in +// the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_SSE4_1(const void* const source, void* const dest, const RestorationUnitInfo& restoration_info, - ptrdiff_t source_stride, ptrdiff_t dest_stride, - int width, int height, + const ptrdiff_t source_stride, + const ptrdiff_t dest_stride, const int width, + const int height, RestorationBuffer* const buffer) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 const auto* src = static_cast<const uint8_t*>(source); auto* dst = static_cast<uint8_t*>(dest); - const int w0 = restoration_info.sgr_proj_info.multiplier[0]; - const int w1 = restoration_info.sgr_proj_info.multiplier[1]; - const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; - const int index = restoration_info.sgr_proj_info.index; - const uint8_t r0 = kSgrProjParams[index][0]; - const uint8_t r1 = kSgrProjParams[index][2]; - const ptrdiff_t array_stride = buffer->box_filter_process_output_stride; - int* box_filter_process_output[2] = {buffer->box_filter_process_output[0], - buffer->box_filter_process_output[1]}; - - BoxFilterProcess_SSE4_1(restoration_info, src, source_stride, width, height, - buffer); - - const __m128i v_w0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w0), 0); - const __m128i v_w1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w1), 0); - const __m128i v_w2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w2), 0); - const __m128i v_r0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r0), 0); - const __m128i v_r1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r1), 0); - const __m128i zero = _mm_setzero_si128(); - // Create masks used to select between src and box_filter_process_output. - const __m128i v_r0_mask = _mm_cmpeq_epi32(v_r0, zero); - const __m128i v_r1_mask = _mm_cmpeq_epi32(v_r1, zero); - - int y = 0; - do { - int x = 0; - do { - const __m128i v_src = _mm_cvtepu8_epi32(Load4(src + x)); - const __m128i v_u = _mm_slli_epi32(v_src, kSgrProjRestoreBits); - const __m128i v_v_a = _mm_mullo_epi32(v_w1, v_u); - const __m128i v_bfp_out0 = - LoadUnaligned16(&box_filter_process_output[0][x]); - // Select u or box_filter_process_output[0][x]. - const __m128i v_r0_mult = _mm_blendv_epi8(v_bfp_out0, v_u, v_r0_mask); - const __m128i v_v_b = _mm_mullo_epi32(v_w0, v_r0_mult); - const __m128i v_v_c = _mm_add_epi32(v_v_a, v_v_b); - const __m128i v_bfp_out1 = - LoadUnaligned16(&box_filter_process_output[1][x]); - // Select u or box_filter_process_output[1][x]. 
- const __m128i v_r1_mult = _mm_blendv_epi8(v_bfp_out1, v_u, v_r1_mask); - const __m128i v_v_d = _mm_mullo_epi32(v_w2, v_r1_mult); - const __m128i v_v_e = _mm_add_epi32(v_v_c, v_v_d); - __m128i v_s = RightShiftWithRounding_S32( - v_v_e, kSgrProjRestoreBits + kSgrProjPrecisionBits); - v_s = _mm_packs_epi32(v_s, v_s); - v_s = _mm_packus_epi16(v_s, v_s); - Store4(&dst[x], v_s); - x += 4; - } while (x < width); - - src += source_stride; - dst += dest_stride; - box_filter_process_output[0] += array_stride; - box_filter_process_output[1] += array_stride; - } while (++y < height); + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. + assert(radius_pass_0 != 0); + BoxFilterProcessPass1(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][0], buffer->sgf_buffer, dst, + dest_stride); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index][1], buffer->sgf_buffer, dst, + dest_stride); + } else { + BoxFilterProcess(src, source_stride, restoration_info, width, height, + kSgrScaleParameter[index], buffer->sgf_buffer, dst, + dest_stride); + } } void Init8bpp() { diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc new file mode 100644 index 00000000000..13f0853b2cb --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc @@ -0,0 +1,397 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_field_projection.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_SSE4_1 + +#include <smmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace dsp { +namespace { + +inline __m128i LoadDivision(const __m128i division_table, + const __m128i reference_offset) { + const __m128i kOne = _mm_set1_epi16(0x0100); + const __m128i t = _mm_add_epi8(reference_offset, reference_offset); + const __m128i tt = _mm_unpacklo_epi8(t, t); + const __m128i idx = _mm_add_epi8(tt, kOne); + return _mm_shuffle_epi8(division_table, idx); +} + +inline __m128i MvProjection(const __m128i mv, const __m128i denominator, + const int numerator) { + const __m128i m0 = _mm_madd_epi16(mv, denominator); + const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator)); + // Add the sign (0 or -1) to round towards zero. 
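The intrinsics below implement the rounding described in the comment above. As a scalar model of one 32-bit lane (the helper name is illustrative, not from the patch), the product is rounded at bit 14, and the sign term gives negative products the same mirrored rounding as positive ones.

    #include <cstdint>

    // Scalar model of one 32-bit lane of MvProjection.
    inline int32_t MvProjectionScalarModel(int16_t mv, int16_t denominator,
                                           int32_t numerator) {
      const int32_t m = mv * denominator * numerator;
      const int32_t sign = m >> 31;  // 0 when m >= 0, -1 when m < 0.
      return (m + sign + (1 << 13)) >> 14;
    }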
+ const __m128i sign = _mm_srai_epi32(m, 31); + const __m128i add_sign = _mm_add_epi32(m, sign); + const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13)); + return _mm_srai_epi32(sum, 14); +} + +inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator, + const int numerator) { + const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128()); + const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128()); + const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128()); + const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128()); + const __m128i s0 = MvProjection(mv0, denorm0, numerator); + const __m128i s1 = MvProjection(mv1, denorm1, numerator); + const __m128i projection = _mm_packs_epi32(s0, s1); + const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp); + const __m128i projection_mv_clamp_negative = + _mm_set1_epi16(-kProjectionMvClamp); + const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp); + return _mm_max_epi16(clamp, projection_mv_clamp_negative); +} + +inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) { + // Add 63 to negative delta so that it shifts towards zero. + const __m128i delta_sign = _mm_srai_epi16(delta, 15); + const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10); + const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63); + const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6); + const __m128i offset1 = _mm_xor_si128(offset0, dst_sign); + return _mm_sub_epi16(offset1, dst_sign); +} + +inline void GetPosition( + const __m128i division_table, const MotionVector* const mv, + const int numerator, const int x8_start, const int x8_end, const int x8, + const __m128i r_offsets, const __m128i source_reference_type8, + const __m128i skip_r, const __m128i y8_floor8, const __m128i y8_ceiling8, + const __m128i d_sign, const int delta, __m128i* const r, + __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) { + const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8); + *r = _mm_shuffle_epi8(r_offsets, source_reference_type8); + const __m128i denorm = LoadDivision(division_table, source_reference_type8); + __m128i projection_mv[2]; + mvs[0] = LoadUnaligned16(mv_int + 0); + mvs[1] = LoadUnaligned16(mv_int + 4); + // Deinterlace x and y components + const __m128i kShuffle = + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle); + const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle); + const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1); + const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1); + // numerator could be 0. + projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator); + projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator); + // Do not update the motion vector if the block position is not valid or + // if position_x8 is outside the current range of x8_start and x8_end. + // Note that position_y8 will always be within the range of y8_start and + // y8_end. + // After subtracting the base, valid projections are within 8-bit. 
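Per 16-bit lane, Project_SSE4_1 above reduces to a divide-by-64 that rounds towards zero, followed by a conditional negation when |dst_sign| is -1. A scalar sketch (illustrative name, not from the patch); the two statements below apply it to the y and x projections and pack the results to 8 bits before the range checks.

    #include <cstdint>

    // |dst_sign| is 0 or -1, mirroring _mm_set1_epi16(dst_sign).
    inline int ProjectScalarModel(int16_t delta, int dst_sign) {
      const int adjusted = delta + ((delta < 0) ? 63 : 0);
      const int offset = adjusted >> 6;       // delta / 64, rounded towards 0.
      return (offset ^ dst_sign) - dst_sign;  // Negate when dst_sign == -1.
    }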
+ const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign); + const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign); + const __m128i positions = _mm_packs_epi16(position_x, position_y); + const __m128i k01234567 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + *position_xy = _mm_add_epi8(positions, k01234567); + const int x8_floor = std::max( + x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8] + const int x8_ceiling = + std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) - + 1; // [-1, 15] + const __m128i x8_floor8 = _mm_set1_epi8(x8_floor); + const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling); + const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8); + const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8); + const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy); + const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy); + const __m128i out = _mm_or_si128(underflow, overflow); + const __m128i skip_low = _mm_or_si128(skip_r, out); + const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8)); + StoreLo8(skip_64, skip); +} + +template <int idx> +inline void Store(const __m128i position, const __m128i reference_offset, + const __m128i mv, int8_t* dst_reference_offset, + MotionVector* dst_mv) { + const ptrdiff_t offset = + static_cast<int16_t>(_mm_extract_epi16(position, idx)); + if ((idx & 3) == 0) { + dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv); + } else { + dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3); + } + dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx); +} + +template <int idx> +inline void CheckStore(const int8_t* skips, const __m128i position, + const __m128i reference_offset, const __m128i mv, + int8_t* dst_reference_offset, MotionVector* dst_mv) { + if (skips[idx] == 0) { + Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv); + } +} + +// 7.9.2. +void MotionFieldProjectionKernel_SSE4_1( + const ReferenceInfo& reference_info, + const int reference_to_current_with_sign, const int dst_sign, + const int y8_start, const int y8_end, const int x8_start, const int x8_end, + TemporalMotionField* const motion_field) { + const ptrdiff_t stride = motion_field->mv.columns(); + // The column range has to be offset by kProjectionMvMaxHorizontalOffset since + // coordinates in that range could end up being position_x8 because of + // projection. 
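As a worked example of the comment above: if the destination tile covers 8x8-block columns [16, 48) in a 60-column motion field, and assuming kProjectionMvMaxHorizontalOffset is 8 (consistent with the [-8, 8] lane ranges noted in GetPosition), the code below scans source columns [8, 56); a motion vector stored at column 9 may legitimately project to position_x8 = 16, which lies inside the destination column range.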
+ const int adjusted_x8_start = + std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0); + const int adjusted_x8_end = std::min( + x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride)); + const int adjusted_x8_end8 = adjusted_x8_end & ~7; + const int leftover = adjusted_x8_end - adjusted_x8_end8; + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; + int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; + MotionVector* dst_mv = motion_field->mv[y8_start]; + const __m128i d_sign = _mm_set1_epi16(dst_sign); + + static_assert(sizeof(int8_t) == sizeof(bool), ""); + static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), ""); + static_assert(sizeof(int32_t) == sizeof(MotionVector), ""); + assert(dst_sign == 0 || dst_sign == -1); + assert(stride == motion_field->reference_offset.columns()); + assert((y8_start & 7) == 0); + assert((adjusted_x8_start & 7) == 0); + // The final position calculation is represented with int16_t. Valid + // position_y8 from its base is at most 7. After considering the horizontal + // offset which is at most |stride - 1|, we have the following assertion, + // which means this optimization works for frame width up to 32K (each + // position is a 8x8 block). + assert(8 * stride <= 32768); + const __m128i skip_reference = LoadLo8(skip_references); + const __m128i r_offsets = LoadLo8(reference_offsets); + const __m128i division_table = LoadUnaligned16(projection_divisions); + + int y8 = y8_start; + do { + const int y8_floor = (y8 & ~7) - y8; // [-7, 0] + const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7] + const __m128i y8_floor8 = _mm_set1_epi8(y8_floor); + const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling); + int x8; + + for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) { + const __m128i source_reference_type8 = + LoadLo8(source_reference_types + x8); + const __m128i skip_r = + _mm_shuffle_epi8(skip_reference, source_reference_type8); + int64_t early_skip; + StoreLo8(&early_skip, skip_r); + // Early termination #1 if all are skips. Chance is typically ~30-40%. + if (early_skip == -1) continue; + int64_t skip_64; + __m128i r, position_xy, mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, x8_start, + x8_end, x8, r_offsets, source_reference_type8, skip_r, + y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64, + mvs); + // Early termination #2 if all are skips. + // Chance is typically ~15-25% after Early termination #1. + if (skip_64 == -1) continue; + const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8)); + const __m128i p_x = _mm_cvtepi8_epi16(position_xy); + const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride)); + const __m128i pos = _mm_add_epi16(p_y_offset, p_x); + const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8)); + if (skip_64 == 0) { + // Store all. Chance is typically ~70-85% after Early termination #2. 
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // Chance is typically ~15-30% after Early termination #2. + // The compiler is smart enough to not create the local buffer skips[]. + int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + } + } + + // The following leftover processing cannot be moved out of the do...while + // loop. Doing so may change the result storing orders of the same position. + if (leftover > 0) { + // Use SIMD only when leftover is at least 4, and there are at least 8 + // elements in a row. + if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) { + // Process the last 8 elements to avoid loading invalid memory. Some + // elements may have been processed in the above loop, which is OK. + const int delta = 8 - leftover; + x8 = adjusted_x8_end - 8; + const __m128i source_reference_type8 = + LoadLo8(source_reference_types + x8); + const __m128i skip_r = + _mm_shuffle_epi8(skip_reference, source_reference_type8); + int64_t early_skip; + StoreLo8(&early_skip, skip_r); + // Early termination #1 if all are skips. + if (early_skip != -1) { + int64_t skip_64; + __m128i r, position_xy, mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, + x8_start, x8_end, x8, r_offsets, source_reference_type8, + skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r, + &position_xy, &skip_64, mvs); + // Early termination #2 if all are skips. + if (skip_64 != -1) { + const __m128i p_y = + _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8)); + const __m128i p_x = _mm_cvtepi8_epi16(position_xy); + const __m128i p_y_offset = + _mm_mullo_epi16(p_y, _mm_set1_epi16(stride)); + const __m128i pos = _mm_add_epi16(p_y_offset, p_x); + const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8)); + // Store up to 7 elements since leftover is at most 7. + if (skip_64 == 0) { + // Store all. + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // The compiler is smart enough to not create the local buffer + // skips[]. 
+ int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + } + } + } + } else { + for (; x8 < adjusted_x8_end; ++x8) { + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; + MotionVector projection_mv; + // reference_to_current_with_sign could be 0. + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], + &projection_mv); + // Do not update the motion vector if the block position is not valid + // or if position_x8 is outside the current range of x8_start and + // x8_end. Note that position_y8 will always be within the range of + // y8_start and y8_end. + const int position_y8 = Project(0, projection_mv.mv[0], dst_sign); + if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue; + const int x8_base = x8 & ~7; + const int x8_floor = + std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset); + const int x8_ceiling = + std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset); + const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign); + if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; + dst_mv[position_y8 * stride + position_x8] = mv[x8]; + dst_reference_offset[position_y8 * stride + position_x8] = + reference_offsets[source_reference_type]; + } + } + } + + source_reference_types += stride; + mv += stride; + dst_reference_offset += stride; + dst_mv += stride; + } while (++y8 < y8_end); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; +} +#endif + +} // namespace + +void MotionFieldProjectionInit_SSE4_1() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionFieldProjectionInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_SSE4_1 diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h new file mode 100644 index 00000000000..7828de5ca39 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h @@ -0,0 +1,37 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::motion_field_projection_kernel. This function is not +// thread-safe. +void MotionFieldProjectionInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_SSE4_1 +#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1 +#endif // LIBGAV1_ENABLE_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_ diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc new file mode 100644 index 00000000000..a4b77da7877 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc @@ -0,0 +1,262 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_vector_search.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_SSE4_1 + +#include <smmintrin.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = { + 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, + 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, + 744, 712, 682, 655, 630, 606, 585, 564, 546, 528}; + +inline __m128i MvProjection(const __m128i mv, const __m128i denominator, + const __m128i numerator) { + const __m128i m0 = _mm_madd_epi16(mv, denominator); + const __m128i m = _mm_mullo_epi32(m0, numerator); + // Add the sign (0 or -1) to round towards zero. 
+ const __m128i sign = _mm_srai_epi32(m, 31); + const __m128i add_sign = _mm_add_epi32(m, sign); + const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13)); + return _mm_srai_epi32(sum, 14); +} + +inline __m128i MvProjectionClip(const __m128i mvs[2], + const __m128i denominators[2], + const __m128i numerator) { + const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator); + const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator); + const __m128i mv = _mm_packs_epi32(s0, s1); + const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp); + const __m128i projection_mv_clamp_negative = + _mm_set1_epi16(-kProjectionMvClamp); + const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp); + return _mm_max_epi16(clamp, projection_mv_clamp_negative); +} + +inline __m128i MvProjectionCompoundClip( + const MotionVector* const temporal_mvs, + const int8_t temporal_reference_offsets[2], + const int reference_offsets[2]) { + const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); + const __m128i temporal_mv = LoadLo8(tmvs); + const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv); + __m128i mvs[2], denominators[2]; + mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0); + mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0); + denominators[0] = _mm_set1_epi32( + kProjectionMvDivisionLookup[temporal_reference_offsets[0]]); + denominators[1] = _mm_set1_epi32( + kProjectionMvDivisionLookup[temporal_reference_offsets[1]]); + const __m128i offsets = LoadLo8(reference_offsets); + const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets); + return MvProjectionClip(mvs, denominators, numerator); +} + +inline __m128i MvProjectionSingleClip( + const MotionVector* const temporal_mvs, + const int8_t* const temporal_reference_offsets, + const int reference_offset) { + const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs); + const __m128i temporal_mv = LoadAligned16(tmvs); + __m128i lookup = _mm_cvtsi32_si128( + kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]); + lookup = _mm_insert_epi32( + lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]], + 1); + lookup = _mm_insert_epi32( + lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]], + 2); + lookup = _mm_insert_epi32( + lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]], + 3); + __m128i mvs[2], denominators[2]; + mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128()); + mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128()); + denominators[0] = _mm_unpacklo_epi32(lookup, lookup); + denominators[1] = _mm_unpackhi_epi32(lookup, lookup); + const __m128i numerator = _mm_set1_epi32(reference_offset); + return MvProjectionClip(mvs, denominators, numerator); +} + +inline void LowPrecision(const __m128i mv, void* const candidate_mvs) { + const __m128i kRoundDownMask = _mm_set1_epi16(~1); + const __m128i sign = _mm_srai_epi16(mv, 15); + const __m128i sub_sign = _mm_sub_epi16(mv, sign); + const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask); + StoreAligned16(candidate_mvs, d); +} + +inline void ForceInteger(const __m128i mv, void* const candidate_mvs) { + const __m128i kRoundDownMask = _mm_set1_epi16(~7); + const __m128i sign = _mm_srai_epi16(mv, 15); + const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3)); + const __m128i mv2 = _mm_sub_epi16(mv1, sign); + const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask); + StoreAligned16(candidate_mvs, mv3); +} + +void 
MvProjectionCompoundLowPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionCompoundClip( + temporal_mvs + i, temporal_reference_offsets + i, offsets); + LowPrecision(mv, candidate_mvs + i); + i += 2; + } while (i < count); +} + +void MvProjectionCompoundForceInteger_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionCompoundClip( + temporal_mvs + i, temporal_reference_offsets + i, offsets); + ForceInteger(mv, candidate_mvs + i); + i += 2; + } while (i < count); +} + +void MvProjectionCompoundHighPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionCompoundClip( + temporal_mvs + i, temporal_reference_offsets + i, offsets); + StoreAligned16(candidate_mvs + i, mv); + i += 2; + } while (i < count); +} + +void MvProjectionSingleLowPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + LowPrecision(mv, candidate_mvs + i); + i += 4; + } while (i < count); +} + +void MvProjectionSingleForceInteger_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + ForceInteger(mv, candidate_mvs + i); + i += 4; + } while (i < count); +} + +void MvProjectionSingleHighPrecision_SSE4_1( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. 
+ int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + StoreAligned16(candidate_mvs + i, mv); + i += 4; + } while (i < count); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; +} +#endif + +} // namespace + +void MotionVectorSearchInit_SSE4_1() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionVectorSearchInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_SSE4_1 diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h new file mode 100644 index 00000000000..b8b04123635 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h @@ -0,0 +1,37 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This +// function is not thread-safe. 
+void MotionVectorSearchInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_SSE4_1 +#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1 +#endif // LIBGAV1_ENABLE_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h index 2a10dc05633..cd61c9275d3 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h @@ -27,7 +27,7 @@ namespace libgav1 { namespace dsp { LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) { - // Unpack 16 bit elements. Goes from: + // Unpack 8 bit elements. Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 // in[2]: 20 21 22 23 @@ -43,10 +43,10 @@ LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) { return _mm_unpacklo_epi16(a0, a1); } -LIBGAV1_ALWAYS_INLINE void Transpose8x8_U8(const __m128i* const in, - __m128i* out) { - // Unpack 16 bit elements. Goes from: - // in[0]: 00 01 02 03 04 05 06 07 +LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in, + __m128i* out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 // in[1]: 10 11 12 13 14 15 16 17 // in[2]: 20 21 22 23 24 25 26 27 // in[3]: 30 31 32 33 34 35 36 37 diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc index 4003f5db459..922110ba573 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc +++ b/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc @@ -19,11 +19,10 @@ #include <smmintrin.h> -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> +#include <cstring> #include <type_traits> #include "src/dsp/constants.h" @@ -69,7 +68,7 @@ inline void HorizontalFilter(const int sx4, const int16_t alpha, f = LoadLo8(kWarpedFilters8[offset]); sx += alpha; } - Transpose8x8_U8(filter, filter); + Transpose8x8To4x16_U8(filter, filter); // |filter| now contains two filters per register. // Staggered combinations allow us to take advantage of _mm_maddubs_epi16 // without overflowing the sign bit. 
The sign bit is hit only where two taps @@ -128,10 +127,10 @@ inline void WriteVerticalFilter(const __m128i filter[8], sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical); if (is_compound) { const __m128i sum = _mm_packs_epi32(sum_low, sum_high); - StoreUnaligned16(reinterpret_cast<int16_t*>(dst_row), sum); + StoreUnaligned16(static_cast<int16_t*>(dst_row), sum); } else { const __m128i sum = _mm_packus_epi32(sum_low, sum_high); - StoreLo8(reinterpret_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); + StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); } } @@ -159,22 +158,206 @@ inline void WriteVerticalFilter(const __m128i filter[8], sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical); if (is_compound) { const __m128i sum = _mm_packs_epi32(sum_low, sum_high); - StoreUnaligned16(reinterpret_cast<int16_t*>(dst_row), sum); + StoreUnaligned16(static_cast<int16_t*>(dst_row), sum); } else { const __m128i sum = _mm_packus_epi32(sum_low, sum_high); - StoreLo8(reinterpret_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); + StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum)); } } -template <bool is_compound> -void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, - int source_height, const int* warp_params, int subsampling_x, - int subsampling_y, int block_start_x, int block_start_y, - int block_width, int block_height, int16_t alpha, int16_t beta, - int16_t gamma, int16_t delta, void* dest, - ptrdiff_t dest_stride) { - constexpr int kRoundBitsVertical = - is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; +template <bool is_compound, typename DestType> +inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, + int delta, DestType* dest_row, + ptrdiff_t dest_stride) { + int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + __m128i filter[8]; + for (__m128i& f : filter) { + const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + f = LoadUnaligned16(kWarpedFilters[offset]); + sy += gamma; + } + Transpose8x8_U16(filter, filter); + WriteVerticalFilter<is_compound>(filter, source, y, dest_row); + dest_row += dest_stride; + sy4 += delta; + } +} + +template <bool is_compound, typename DestType> +inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma, + int delta, DestType* dest_row, + ptrdiff_t dest_stride) { + int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + __m128i filter[8]; + for (__m128i& f : filter) { + const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + f = LoadUnaligned16(kWarpedFilters[offset]); + sy += gamma; + } + Transpose8x8_U16(filter, filter); + WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row); + dest_row += dest_stride; + sy4 += delta; + } +} + +template <bool is_compound, typename DestType> +inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int source_height, int ix4, int iy4, + DestType* dst_row, ptrdiff_t dest_stride) { + // Region 1 + // Points to the left or right border of the first row of |src|. + const uint8_t* first_row_border = + (ix4 + 7 <= 0) ? 
src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Region 1. + // Every sample used to calculate the prediction block has the same + // value. So the whole prediction block has the same value. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint8_t row_border_pixel = first_row_border[row * source_stride]; + + if (is_compound) { + const __m128i sum = + _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical - + kInterRoundBitsCompoundVertical)); + StoreUnaligned16(dst_row, sum); + } else { + memset(dst_row, row_border_pixel, 8); + } + const DestType* const first_dst_row = dst_row; + dst_row += dest_stride; + for (int y = 1; y < 8; ++y) { + memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); + dst_row += dest_stride; + } +} + +template <bool is_compound, typename DestType> +inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int y4, int ix4, int iy4, int gamma, + int delta, int16_t intermediate_result_column[15], + DestType* dst_row, ptrdiff_t dest_stride) { + // Region 2. + // Points to the left or right border of the first row of |src|. + const uint8_t* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + + // Region 2. + // Horizontal filter. + // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + int sum = first_row_border[row * source_stride]; + sum <<= (kFilterBits - kInterRoundBitsHorizontal); + intermediate_result_column[y + 7] = sum; + } + // Region 2 vertical filter. + VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma, + delta, dst_row, dest_stride); +} + +template <bool is_compound, typename DestType> +inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride, + int source_height, int alpha, int beta, int x4, int ix4, + int iy4, int16_t intermediate_result[15][8]) { + // Region 3 + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. 
In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding byte after the right border of the last source row. + const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template <bool is_compound, typename DestType> +inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha, + int beta, int x4, int ix4, int iy4, + int16_t intermediate_result[15][8]) { + // Region 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding byte after the right border of the last source row. + const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + // Convert src_row_v to int8 (subtract 128). + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template <bool is_compound, typename DestType> +inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int source_height, + const int* warp_params, int subsampling_x, + int subsampling_y, int src_x, int src_y, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta, DestType* dst_row, + ptrdiff_t dest_stride) { union { // Intermediate_result is the output of the horizontal filtering and // rounding. 
The range is within 13 (= bitdepth + kFilterBits + 1 - @@ -187,242 +370,133 @@ void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, int16_t intermediate_result_column[15]; }; + const int dst_x = + src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; + const int dst_y = + src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; + const int x4 = dst_x >> subsampling_x; + const int y4 = dst_y >> subsampling_y; + const int ix4 = x4 >> kWarpedModelPrecisionBits; + const int iy4 = y4 >> kWarpedModelPrecisionBits; + // A prediction block may fall outside the frame's boundaries. If a + // prediction block is calculated using only samples outside the frame's + // boundary, the filtering can be simplified. We can divide the plane + // into several regions and handle them differently. + // + // | | + // 1 | 3 | 1 + // | | + // -------+-----------+------- + // |***********| + // 2 |*****4*****| 2 + // |***********| + // -------+-----------+------- + // | | + // 1 | 3 | 1 + // | | + // + // At the center, region 4 represents the frame and is the general case. + // + // In regions 1 and 2, the prediction block is outside the frame's + // boundary horizontally. Therefore the horizontal filtering can be + // simplified. Furthermore, in the region 1 (at the four corners), the + // prediction is outside the frame's boundary both horizontally and + // vertically, so we get a constant prediction block. + // + // In region 3, the prediction block is outside the frame's boundary + // vertically. Unfortunately because we apply the horizontal filters + // first, by the time we apply the vertical filters, they no longer see + // simple inputs. So the only simplification is that all the rows are + // the same, but we still need to apply all the horizontal and vertical + // filters. + + // Check for two simple special cases, where the horizontal filter can + // be significantly simplified. + // + // In general, for each row, the horizontal filter is calculated as + // follows: + // for (int x = -4; x < 4; ++x) { + // const int offset = ...; + // int sum = first_pass_offset; + // for (int k = 0; k < 8; ++k) { + // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); + // sum += kWarpedFilters[offset][k] * src_row[column]; + // } + // ... + // } + // The column index before clipping, ix4 + x + k - 3, varies in the range + // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame in both directions. One repeated value. + WarpRegion1<is_compound, DestType>(src, source_stride, source_width, + source_height, ix4, iy4, dst_row, + dest_stride); + return; + } + // Outside the frame horizontally. Rows repeated. + WarpRegion2<is_compound, DestType>( + src, source_stride, source_width, y4, ix4, iy4, gamma, delta, + intermediate_result_column, dst_row, dest_stride); + return; + } + + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame vertically. + WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha, + beta, x4, ix4, iy4, intermediate_result); + } else { + // Inside the frame. 
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4, + iy4, intermediate_result); + } + // Region 3 and 4 vertical filter. + VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta, + dst_row, dest_stride); +} + +template <bool is_compound> +void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, + int source_height, const int* warp_params, int subsampling_x, + int subsampling_y, int block_start_x, int block_start_y, + int block_width, int block_height, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta, void* dest, + ptrdiff_t dest_stride) { const auto* const src = static_cast<const uint8_t*>(source); using DestType = typename std::conditional<is_compound, int16_t, uint8_t>::type; auto* dst = static_cast<DestType*>(dest); + // Warp process applies for each 8x8 block. assert(block_width >= 8); assert(block_height >= 8); - - // Warp process applies for each 8x8 block (or smaller). - int start_y = block_start_y; + const int block_end_x = block_start_x + block_width; + const int block_end_y = block_start_y + block_height; + + const int start_x = block_start_x; + const int start_y = block_start_y; + int src_x = (start_x + 4) << subsampling_x; + int src_y = (start_y + 4) << subsampling_y; + const int end_x = (block_end_x + 4) << subsampling_x; + const int end_y = (block_end_y + 4) << subsampling_y; do { - int start_x = block_start_x; + DestType* dst_row = dst; + src_x = (start_x + 4) << subsampling_x; do { - const int src_x = (start_x + 4) << subsampling_x; - const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; - // A prediction block may fall outside the frame's boundaries. If a - // prediction block is calculated using only samples outside the frame's - // boundary, the filtering can be simplified. We can divide the plane - // into several regions and handle them differently. - // - // | | - // 1 | 3 | 1 - // | | - // -------+-----------+------- - // |***********| - // 2 |*****4*****| 2 - // |***********| - // -------+-----------+------- - // | | - // 1 | 3 | 1 - // | | - // - // At the center, region 4 represents the frame and is the general case. - // - // In regions 1 and 2, the prediction block is outside the frame's - // boundary horizontally. Therefore the horizontal filtering can be - // simplified. Furthermore, in the region 1 (at the four corners), the - // prediction is outside the frame's boundary both horizontally and - // vertically, so we get a constant prediction block. - // - // In region 3, the prediction block is outside the frame's boundary - // vertically. Unfortunately because we apply the horizontal filters - // first, by the time we apply the vertical filters, they no longer see - // simple inputs. So the only simplification is that all the rows are - // the same, but we still need to apply all the horizontal and vertical - // filters. - - // Check for two simple special cases, where the horizontal filter can - // be significantly simplified. 
- // - // In general, for each row, the horizontal filter is calculated as - // follows: - // for (int x = -4; x < 4; ++x) { - // const int offset = ...; - // int sum = first_pass_offset; - // for (int k = 0; k < 8; ++k) { - // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); - // sum += kWarpedFilters[offset][k] * src_row[column]; - // } - // ... - // } - // The column index before clipping, ix4 + x + k - 3, varies in the range - // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 - // or ix4 + 7 <= 0, then all the column indexes are clipped to the same - // border index (source_width - 1 or 0, respectively). Then for each x, - // the inner for loop of the horizontal filter is reduced to multiplying - // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { - // Regions 1 and 2. - // Points to the left or right border of the first row of |src|. - const uint8_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; - // In general, for y in [-7, 8), the row number iy4 + y is clipped: - // const int row = Clip3(iy4 + y, 0, source_height - 1); - // In two special cases, iy4 + y is clipped to either 0 or - // source_height - 1 for all y. In the rest of the cases, iy4 + y is - // bounded and we can avoid clipping iy4 + y by relying on a reference - // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { - // Region 1. - // Every sample used to calculate the prediction block has the same - // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; - const uint8_t row_border_pixel = - first_row_border[row * source_stride]; - - DestType* dst_row = dst + start_x - block_start_x; - if (is_compound) { - const __m128i sum = - _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical - - kRoundBitsVertical)); - StoreUnaligned16(dst_row, sum); - } else { - memset(dst_row, row_border_pixel, 8); - } - const DestType* const first_dst_row = dst_row; - dst_row += dest_stride; - for (int y = 1; y < 8; ++y) { - memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); - dst_row += dest_stride; - } - // End of region 1. Continue the |start_x| do-while loop. - start_x += 8; - continue; - } - - // Region 2. - // Horizontal filter. - // The input values in this region are generated by extending the border - // which makes them identical in the horizontal direction. This - // computation could be inlined in the vertical pass but most - // implementations will need a transpose of some sort. - // It is not necessary to use the offset values here because the - // horizontal pass is a simple shift and the vertical pass will always - // require using 32 bits. - for (int y = -7; y < 8; ++y) { - // We may over-read up to 13 pixels above the top source row, or up - // to 13 pixels below the bottom source row. This is proved in - // warp.cc. - const int row = iy4 + y; - int sum = first_row_border[row * source_stride]; - sum <<= (kFilterBits - kInterRoundBitsHorizontal); - intermediate_result_column[y + 7] = sum; - } - // Vertical filter. 
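For reference, the per-sample horizontal accumulation described in the pseudocode comment above can be written out as a small scalar routine. This is an illustrative sketch only: the |offset| selection and the final rounding are elided exactly as they are in the comment, |filter| stands in for kWarpedFilters[offset], and the function name is hypothetical rather than a libgav1 identifier.

#include <algorithm>
#include <cstdint>

// Scalar sketch of the clipped 8-tap horizontal accumulation. Columns outside
// [0, source_width - 1] clamp to the border pixel, which is what makes the
// region 1/2 special cases possible.
inline int WarpHorizontalTap8(const uint8_t* src_row, int source_width,
                              int ix4, int x, const int16_t filter[8],
                              int first_pass_offset) {
  int sum = first_pass_offset;
  for (int k = 0; k < 8; ++k) {
    const int column =
        std::min(std::max(ix4 + x + k - 3, 0), source_width - 1);
    sum += filter[k] * src_row[column];
  }
  return sum;
}

When ix4 + 7 <= 0 or ix4 - 7 >= source_width - 1, every |column| above clamps to the same border index, so the loop collapses to the border pixel times the sum of the filter taps, which is exactly the simplification the region 1 and region 2 paths exploit.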
- DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); - for (int y = 0; y < 8; ++y) { - int sy = sy4 - MultiplyBy4(gamma); - __m128i filter[8]; - for (__m128i& f : filter) { - const int offset = - RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + - kWarpedPixelPrecisionShifts; - f = LoadUnaligned16(kWarpedFilters[offset]); - sy += gamma; - } - Transpose8x8_U16(filter, filter); - WriteVerticalFilter<is_compound>( - filter, &intermediate_result_column[y], dst_row); - dst_row += dest_stride; - sy4 += delta; - } - // End of region 2. Continue the |start_x| do-while loop. - start_x += 8; - continue; - } - - // Regions 3 and 4. - // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. - - // In general, for y in [-7, 8), the row number iy4 + y is clipped: - // const int row = Clip3(iy4 + y, 0, source_height - 1); - // In two special cases, iy4 + y is clipped to either 0 or - // source_height - 1 for all y. In the rest of the cases, iy4 + y is - // bounded and we can avoid clipping iy4 + y by relying on a reference - // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { - // Region 3. - // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; - const uint8_t* const src_row = src + row * source_stride; - // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also - // read but is ignored. - // - // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 - // bytes after src_row[source_width - 1]. We assume the source frame - // has left and right borders of at least 13 bytes that extend the - // frame boundary pixels. We also assume there is at least one extra - // padding byte after the right border of the last source row. - const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; - for (int y = -7; y < 8; ++y) { - HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); - sx4 += beta; - } - } else { - // Region 4. - // Horizontal filter. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; - for (int y = -7; y < 8; ++y) { - // We may over-read up to 13 pixels above the top source row, or up - // to 13 pixels below the bottom source row. This is proved in - // warp.cc. - const int row = iy4 + y; - const uint8_t* const src_row = src + row * source_stride; - // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also - // read but is ignored. - // - // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 - // bytes after src_row[source_width - 1]. We assume the source frame - // has left and right borders of at least 13 bytes that extend the - // frame boundary pixels. We also assume there is at least one extra - // padding byte after the right border of the last source row. - const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); - // Convert src_row_v to int8 (subtract 128). - HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); - sx4 += beta; - } - } - - // Regions 3 and 4. - // Vertical filter. 
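The region picture drawn in the comments above boils down to two interval tests on the integer block position. A minimal standalone sketch of that classification, using the same comparisons HandleWarpBlock performs (the enum and function names here are illustrative, not libgav1 identifiers):

// Classifies an 8x8 warp block: horizontally outside the frame when
// ix4 - 7 >= source_width - 1 or ix4 + 7 <= 0, and analogously in the
// vertical direction with iy4 and source_height.
enum class WarpRegion { kCorner1, kHorizontalBorder2, kVerticalBorder3, kInterior4 };

inline WarpRegion ClassifyWarpBlock(int ix4, int iy4, int source_width,
                                    int source_height) {
  const bool outside_x = ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;
  const bool outside_y = iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;
  if (outside_x) {
    return outside_y ? WarpRegion::kCorner1 : WarpRegion::kHorizontalBorder2;
  }
  return outside_y ? WarpRegion::kVerticalBorder3 : WarpRegion::kInterior4;
}

Region 1 (the corners) yields a constant prediction block, region 2 needs only the trivial horizontal pass on a border column, and regions 3 and 4 share the full horizontal plus vertical filtering shown above.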
- DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); - for (int y = 0; y < 8; ++y) { - int sy = sy4 - MultiplyBy4(gamma); - __m128i filter[8]; - for (__m128i& f : filter) { - const int offset = - RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + - kWarpedPixelPrecisionShifts; - f = LoadUnaligned16(kWarpedFilters[offset]); - sy += gamma; - } - Transpose8x8_U16(filter, filter); - WriteVerticalFilter<is_compound>(filter, intermediate_result, y, - dst_row); - dst_row += dest_stride; - sy4 += delta; - } - start_x += 8; - } while (start_x < block_start_x + block_width); + HandleWarpBlock<is_compound, DestType>( + src, source_stride, source_width, source_height, warp_params, + subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta, + dst_row, dest_stride); + src_x += (8 << subsampling_x); + dst_row += 8; + } while (src_x < end_x); dst += 8 * dest_stride; - start_y += 8; - } while (start_y < block_start_y + block_height); + src_y += (8 << subsampling_y); + } while (src_y < end_y); } void Init8bpp() { diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h index 42309916eb0..841dd5a26af 100644 --- a/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h +++ b/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h @@ -36,6 +36,7 @@ void WeightMaskInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1 +#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1 #define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1 diff --git a/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h b/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h index 6b336b0a58c..1d6a1f4fadb 100644 --- a/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h +++ b/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h @@ -17,17 +17,19 @@ #ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_ #define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_ +#include <condition_variable> // NOLINT (unapproved c++11 header) #include <cstdint> #include <memory> #include <mutex> // NOLINT (unapproved c++11 header) -#include "src/loop_filter_mask.h" #include "src/loop_restoration_info.h" #include "src/residual_buffer_pool.h" #include "src/symbol_decoder_context.h" #include "src/threading_strategy.h" #include "src/tile_scratch_buffer.h" #include "src/utils/array_2d.h" +#include "src/utils/block_parameters_holder.h" +#include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" #include "src/utils/dynamic_buffer.h" #include "src/utils/memory.h" @@ -37,17 +39,21 @@ namespace libgav1 { +// Buffer used to store the unfiltered pixels that are necessary for decoding +// the next superblock row (for the intra prediction process). +using IntraPredictionBuffer = + std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>; + // Buffer to facilitate decoding a frame. This struct is used only within // DecoderImpl::DecodeTiles(). 
struct FrameScratchBuffer { - LoopFilterMask loop_filter_mask; LoopRestorationInfo loop_restoration_info; Array2D<int16_t> cdef_index; Array2D<TransformSize> inter_transform_sizes; + BlockParametersHolder block_parameters_holder; TemporalMotionField motion_field; SymbolDecoderContext symbol_decoder_context; std::unique_ptr<ResidualBufferPool> residual_buffer_pool; - Array2D<SuperBlockState> superblock_state; // threaded_window_buffer will be subdivided by PostFilter into windows of // width 512 pixels. Each row in the window is filtered by a worker thread. // To avoid false sharing, each 512-pixel row processed by one thread should @@ -62,11 +68,22 @@ struct FrameScratchBuffer { // for every 32x32 for chroma with subsampling). The indices of the rows that // are stored are specified in |kDeblockedRowsForLoopRestoration|. YuvBuffer deblock_buffer; + // The size of this dynamic buffer is |tile_rows|. + DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers; TileScratchBufferPool tile_scratch_buffer_pool; - // TODO(vigneshv): This is part of the frame scratch buffer for now. This will - // have to change or move to DecoderImpl when frame parallel mode with - // in-frame multi-theading is implemented. ThreadingStrategy threading_strategy; + std::mutex superblock_row_mutex; + // The size of this buffer is the number of superblock rows. + // |superblock_row_progress[i]| is incremented whenever a tile finishes + // decoding superblock row at index i. If the count reaches tile_columns, then + // |superblock_row_progress_condvar[i]| is notified. + DynamicBuffer<int> superblock_row_progress + LIBGAV1_GUARDED_BY(superblock_row_mutex); + // The size of this buffer is the number of superblock rows. Used to wait for + // |superblock_row_progress[i]| to reach tile_columns. + DynamicBuffer<std::condition_variable> superblock_row_progress_condvar; + // Used to signal tile decoding failure in the combined multithreading mode. + bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex); }; class FrameScratchBufferPool { @@ -89,8 +106,6 @@ class FrameScratchBufferPool { private: std::mutex mutex_; - // TODO(b/142583029): The size of this stack is set to kMaxThreads. This may - // have to be revisited as we iterate over the frame parallel design. Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_ LIBGAV1_GUARDED_BY(mutex_); }; diff --git a/chromium/third_party/libgav1/src/src/gav1/decoder.h b/chromium/third_party/libgav1/src/src/gav1/decoder.h index 5151d647b6f..9d0d87291ee 100644 --- a/chromium/third_party/libgav1/src/src/gav1/decoder.h +++ b/chromium/third_party/libgav1/src/src/gav1/decoder.h @@ -94,11 +94,11 @@ class LIBGAV1_PUBLIC Decoder { // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a // successful |EnqueueFrame()| call, the caller must keep the |data| buffer // alive until: - // 1) If release_input_buffer is not nullptr, then |data| buffer must be kept - // alive until release_input_buffer is called with the |buffer_private_data| - // passed into this EnqueueFrame call. - // 2) If release_input_buffer is nullptr, then |data| buffer must be kept - // alive until the corresponding DequeueFrame() call is completed. + // 1) If |settings_.release_input_buffer| is not nullptr, then |data| buffer + // must be kept alive until release_input_buffer is called with the + // |buffer_private_data| passed into this EnqueueFrame call. 
+ // 2) If |settings_.release_input_buffer| is nullptr, then |data| buffer must + // be kept alive until the corresponding DequeueFrame() call is completed. StatusCode EnqueueFrame(const uint8_t* data, size_t size, int64_t user_private_data, void* buffer_private_data); @@ -107,9 +107,12 @@ class LIBGAV1_PUBLIC Decoder { // compressed frame. If there are no displayable frames available, sets // |*out_ptr| to nullptr. Returns an error status if there is an error. // - // In frame parallel mode, if |settings_.blocking_dequeue| is true, then this - // call will block until an enqueued frame has been decoded. Otherwise, it - // will return kStatusTryAgain if an enqueued frame is not yet decoded. + // If |settings_.blocking_dequeue| is false and the decoder is operating in + // frame parallel mode (|settings_.frame_parallel| is true and the video + // stream passes the decoder's heuristics for enabling frame parallel mode), + // then this call will return kStatusTryAgain if an enqueued frame is not yet + // decoded (it is a non blocking call in this case). In all other cases, this + // call will block until an enqueued frame has been decoded. StatusCode DequeueFrame(const DecoderBuffer** out_ptr); // Signals the end of stream. diff --git a/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h b/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h index d7ec8d6754b..33777248a3c 100644 --- a/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h +++ b/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h @@ -41,15 +41,13 @@ typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data, void* buffer_private_data); typedef struct Libgav1DecoderSettings { - // Number of threads to use when decoding. Must be greater than 0. The - // library will create at most |threads|-1 new threads, the calling thread is - // considered part of the library's thread count. Defaults to 1 (no new - // threads will be created). + // Number of threads to use when decoding. Must be greater than 0. The library + // will create at most |threads| new threads. Defaults to 1 (no new threads + // will be created). int threads; - // A boolean. Do frame parallel decoding. - // - // NOTE: Frame parallel decoding is not implemented, this setting is - // currently ignored. + // A boolean. Indicate to the decoder that frame parallel decoding is allowed. + // Note that this is just a request and the decoder will decide the number of + // frames to be decoded in parallel based on the video stream being decoded. int frame_parallel; // A boolean. In frame parallel mode, should Libgav1DecoderDequeueFrame wait // until a enqueued frame is available for dequeueing. @@ -91,15 +89,13 @@ using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback; // Applications must populate this structure before creating a decoder instance. struct DecoderSettings { - // Number of threads to use when decoding. Must be greater than 0. The - // library will create at most |threads|-1 new threads, the calling thread is - // considered part of the library's thread count. Defaults to 1 (no new - // threads will be created). + // Number of threads to use when decoding. Must be greater than 0. The library + // will create at most |threads| new threads. Defaults to 1 (no new threads + // will be created). int threads = 1; - // Do frame parallel decoding. - // - // NOTE: Frame parallel decoding is not implemented, this setting is - // currently ignored. 
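Taken together, the EnqueueFrame()/DequeueFrame() contract and the settings documented in these headers imply a calling pattern roughly like the sketch below. It is a hedged example, not canonical usage: the Init() entry point and the kStatusOk/kStatusTryAgain constants are assumed from libgav1's public headers rather than shown in this diff, and buffer ownership and error handling are simplified.

#include <cstddef>
#include <cstdint>

#include "gav1/decoder.h"

// Sketch of one enqueue/dequeue round trip against the documented contract.
// |data| must outlive the DequeueFrame() call because no release_input_buffer
// callback is installed here.
bool DecodeTemporalUnit(const uint8_t* data, size_t size) {
  libgav1::DecoderSettings settings;
  settings.threads = 4;              // The library creates at most 4 new threads.
  settings.frame_parallel = true;    // A request; the decoder decides.
  settings.blocking_dequeue = false; // Allow kStatusTryAgain from DequeueFrame.

  libgav1::Decoder decoder;
  if (decoder.Init(&settings) != libgav1::kStatusOk) return false;

  if (decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
                           /*buffer_private_data=*/nullptr) !=
      libgav1::kStatusOk) {
    return false;
  }

  const libgav1::DecoderBuffer* buffer = nullptr;
  libgav1::StatusCode status;
  do {
    // In non-blocking frame parallel mode this can return kStatusTryAgain
    // until the enqueued frame has actually been decoded.
    status = decoder.DequeueFrame(&buffer);
  } while (status == libgav1::kStatusTryAgain);
  // |buffer| may be null if there is nothing displayable yet.
  return status == libgav1::kStatusOk;
}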
+ // Indicate to the decoder that frame parallel decoding is allowed. Note that + // this is just a request and the decoder will decide the number of frames to + // be decoded in parallel based on the video stream being decoded. bool frame_parallel = false; // In frame parallel mode, should DequeueFrame wait until a enqueued frame is // available for dequeueing. diff --git a/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake b/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake index a97f1425dd3..b97d09def17 100644 --- a/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake +++ b/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake @@ -33,8 +33,6 @@ list(APPEND libgav1_decoder_sources "${libgav1_source}/inter_intra_masks.inc" "${libgav1_source}/internal_frame_buffer_list.cc" "${libgav1_source}/internal_frame_buffer_list.h" - "${libgav1_source}/loop_filter_mask.cc" - "${libgav1_source}/loop_filter_mask.h" "${libgav1_source}/loop_restoration_info.cc" "${libgav1_source}/loop_restoration_info.h" "${libgav1_source}/motion_vector.cc" @@ -43,6 +41,7 @@ list(APPEND libgav1_decoder_sources "${libgav1_source}/obu_parser.h" "${libgav1_source}/post_filter/cdef.cc" "${libgav1_source}/post_filter/deblock.cc" + "${libgav1_source}/post_filter/deblock_thresholds.inc" "${libgav1_source}/post_filter/loop_restoration.cc" "${libgav1_source}/post_filter/post_filter.cc" "${libgav1_source}/post_filter/super_res.cc" @@ -56,6 +55,7 @@ list(APPEND libgav1_decoder_sources "${libgav1_source}/reconstruction.h" "${libgav1_source}/residual_buffer_pool.cc" "${libgav1_source}/residual_buffer_pool.h" + "${libgav1_source}/scan_tables.inc" "${libgav1_source}/symbol_decoder_context.cc" "${libgav1_source}/symbol_decoder_context.h" "${libgav1_source}/symbol_decoder_context_cdfs.inc" diff --git a/chromium/third_party/libgav1/src/src/loop_filter_mask.cc b/chromium/third_party/libgav1/src/src/loop_filter_mask.cc deleted file mode 100644 index 8f96df9bf92..00000000000 --- a/chromium/third_party/libgav1/src/src/loop_filter_mask.cc +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright 2019 The libgav1 Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "src/loop_filter_mask.h" - -#include <algorithm> -#include <cstdint> -#include <cstring> -#include <memory> -#include <new> - -#include "src/utils/array_2d.h" -#include "src/utils/compiler_attributes.h" - -namespace libgav1 { - -#if !LIBGAV1_CXX17 -// static. -constexpr BitMaskSet LoopFilterMask::kPredictionModeDeltasMask; -#endif - -bool LoopFilterMask::Reset(int width, int height) { - num_64x64_blocks_per_row_ = DivideBy64(width + 63); - num_64x64_blocks_per_column_ = DivideBy64(height + 63); - const int num_64x64_blocks = - num_64x64_blocks_per_row_ * num_64x64_blocks_per_column_; - if (num_64x64_blocks_ == -1 || num_64x64_blocks_ < num_64x64_blocks) { - // Note that this need not be zero initialized here since we zero - // initialize the required number of entries in the loop that follows. 
- loop_filter_masks_.reset(new (std::nothrow) - Data[num_64x64_blocks]); // NOLINT. - if (loop_filter_masks_ == nullptr) { - return false; - } - } - for (int i = 0; i < num_64x64_blocks; ++i) { - memset(&loop_filter_masks_[i], 0, sizeof(loop_filter_masks_[i])); - } - num_64x64_blocks_ = num_64x64_blocks; - return true; -} - -void LoopFilterMask::Build( - const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, int tile_group_start, - int tile_group_end, const BlockParametersHolder& block_parameters_holder, - const Array2D<TransformSize>& inter_transform_sizes) { - for (int tile_number = tile_group_start; tile_number <= tile_group_end; - ++tile_number) { - const int row = tile_number / frame_header.tile_info.tile_columns; - const int column = tile_number % frame_header.tile_info.tile_columns; - const int row4x4_start = frame_header.tile_info.tile_row_start[row]; - const int row4x4_end = frame_header.tile_info.tile_row_start[row + 1]; - const int column4x4_start = - frame_header.tile_info.tile_column_start[column]; - const int column4x4_end = - frame_header.tile_info.tile_column_start[column + 1]; - - const int num_planes = sequence_header.color_config.is_monochrome - ? kMaxPlanesMonochrome - : kMaxPlanes; - for (int plane = kPlaneY; plane < num_planes; ++plane) { - // For U and V planes, do not build bit masks if level == 0. - if (plane > kPlaneY && frame_header.loop_filter.level[plane + 1] == 0) { - continue; - } - const int8_t subsampling_x = - (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x; - const int8_t subsampling_y = - (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_y; - const int vertical_step = 1 << subsampling_y; - const int horizontal_step = 1 << subsampling_x; - - // Build bit masks for vertical edges (except the frame boundary). - if (column4x4_start != 0) { - const int plane_height = - RightShiftWithRounding(frame_header.height, subsampling_y); - const int row4x4_limit = - std::min(row4x4_end, DivideBy4(plane_height + 3) << subsampling_y); - const int vertical_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical]; - for (int row4x4 = GetDeblockPosition(row4x4_start, subsampling_y); - row4x4 < row4x4_limit; row4x4 += vertical_step) { - const int column4x4 = - GetDeblockPosition(column4x4_start, subsampling_x); - const BlockParameters& bp = - *block_parameters_holder.Find(row4x4, column4x4); - const uint8_t vertical_level = - bp.deblock_filter_level[vertical_level_index]; - const BlockParameters& bp_left = *block_parameters_holder.Find( - row4x4, column4x4 - horizontal_step); - const uint8_t left_level = - bp_left.deblock_filter_level[vertical_level_index]; - const int unit_id = DivideBy16(row4x4) * num_64x64_blocks_per_row_ + - DivideBy16(column4x4); - const int row = row4x4 % kNum4x4InLoopFilterMaskUnit; - const int column = column4x4 % kNum4x4InLoopFilterMaskUnit; - const int shift = LoopFilterMask::GetShift(row, column); - const int index = LoopFilterMask::GetIndex(row); - const auto mask = static_cast<uint64_t>(1) << shift; - // Tile boundary must be coding block boundary. So we don't have to - // check (!left_skip || !skip || is_vertical_border). - if (vertical_level != 0 || left_level != 0) { - assert(inter_transform_sizes[row4x4] != nullptr); - const TransformSize tx_size = - (plane == kPlaneY) ? inter_transform_sizes[row4x4][column4x4] - : bp.uv_transform_size; - const TransformSize left_tx_size = - (plane == kPlaneY) - ? 
inter_transform_sizes[row4x4][column4x4 - horizontal_step] - : bp_left.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - GetTransformSizeIdWidth(tx_size, left_tx_size); - SetLeft(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (vertical_level == 0) ? left_level : vertical_level; - SetLevel(current_level, unit_id, plane, kLoopFilterTypeVertical, - LoopFilterMask::GetLevelOffset(row, column)); - } - } - } - - // Build bit masks for horizontal edges (except the frame boundary). - if (row4x4_start != 0) { - const int plane_width = - RightShiftWithRounding(frame_header.width, subsampling_x); - const int column4x4_limit = std::min( - column4x4_end, DivideBy4(plane_width + 3) << subsampling_y); - const int horizontal_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal]; - for (int column4x4 = GetDeblockPosition(column4x4_start, subsampling_x); - column4x4 < column4x4_limit; column4x4 += horizontal_step) { - const int row4x4 = GetDeblockPosition(row4x4_start, subsampling_y); - const BlockParameters& bp = - *block_parameters_holder.Find(row4x4, column4x4); - const uint8_t horizontal_level = - bp.deblock_filter_level[horizontal_level_index]; - const BlockParameters& bp_top = - *block_parameters_holder.Find(row4x4 - vertical_step, column4x4); - const uint8_t top_level = - bp_top.deblock_filter_level[horizontal_level_index]; - const int unit_id = DivideBy16(row4x4) * num_64x64_blocks_per_row_ + - DivideBy16(column4x4); - const int row = row4x4 % kNum4x4InLoopFilterMaskUnit; - const int column = column4x4 % kNum4x4InLoopFilterMaskUnit; - const int shift = LoopFilterMask::GetShift(row, column); - const int index = LoopFilterMask::GetIndex(row); - const auto mask = static_cast<uint64_t>(1) << shift; - // Tile boundary must be coding block boundary. So we don't have to - // check (!top_skip || !skip || is_horizontal_border). - if (horizontal_level != 0 || top_level != 0) { - assert(inter_transform_sizes[row4x4] != nullptr); - const TransformSize tx_size = - (plane == kPlaneY) ? inter_transform_sizes[row4x4][column4x4] - : bp.uv_transform_size; - const TransformSize top_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes[row4x4 - vertical_step][column4x4] - : bp_top.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - static_cast<LoopFilterTransformSizeId>( - std::min({kTransformHeightLog2[tx_size] - 2, - kTransformHeightLog2[top_tx_size] - 2, 2})); - SetTop(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (horizontal_level == 0) ? 
top_level : horizontal_level; - SetLevel(current_level, unit_id, plane, kLoopFilterTypeHorizontal, - LoopFilterMask::GetLevelOffset(row, column)); - } - } - } - } - } - assert(IsValid()); -} - -bool LoopFilterMask::IsValid() const { - for (int mask_id = 0; mask_id < num_64x64_blocks_; ++mask_id) { - for (int plane = 0; plane < kMaxPlanes; ++plane) { - for (int i = 0; i < kNumLoopFilterTransformSizeIds; ++i) { - for (int j = i + 1; j < kNumLoopFilterTransformSizeIds; ++j) { - for (int k = 0; k < kNumLoopFilterMasks; ++k) { - if ((loop_filter_masks_[mask_id].left[plane][i][k] & - loop_filter_masks_[mask_id].left[plane][j][k]) != 0 || - (loop_filter_masks_[mask_id].top[plane][i][k] & - loop_filter_masks_[mask_id].top[plane][j][k]) != 0) { - return false; - } - } - } - } - } - } - return true; -} - -} // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/loop_filter_mask.h b/chromium/third_party/libgav1/src/src/loop_filter_mask.h deleted file mode 100644 index 314f020b99b..00000000000 --- a/chromium/third_party/libgav1/src/src/loop_filter_mask.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright 2019 The libgav1 Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LIBGAV1_SRC_LOOP_FILTER_MASK_H_ -#define LIBGAV1_SRC_LOOP_FILTER_MASK_H_ - -#include <array> -#include <cassert> -#include <cstdint> -#include <memory> - -#include "src/dsp/constants.h" -#include "src/dsp/dsp.h" -#include "src/obu_parser.h" -#include "src/utils/array_2d.h" -#include "src/utils/bit_mask_set.h" -#include "src/utils/block_parameters_holder.h" -#include "src/utils/common.h" -#include "src/utils/constants.h" -#include "src/utils/segmentation.h" -#include "src/utils/types.h" - -namespace libgav1 { - -class LoopFilterMask { - public: - // This structure holds loop filter bit masks for a 64x64 block. - // 64x64 block contains kNum4x4In64x64 = (64x64 / (4x4) = 256) - // 4x4 blocks. It requires kNumLoopFilterMasks = 4 uint64_t to represent them. - struct Data : public Allocable { - uint8_t level[kMaxPlanes][kNumLoopFilterTypes][kNum4x4In64x64]; - uint64_t left[kMaxPlanes][kNumLoopFilterTransformSizeIds] - [kNumLoopFilterMasks]; - uint64_t top[kMaxPlanes][kNumLoopFilterTransformSizeIds] - [kNumLoopFilterMasks]; - }; - - LoopFilterMask() = default; - - // Loop filter mask is built and used for each superblock individually. - // Thus not copyable/movable. - LoopFilterMask(const LoopFilterMask&) = delete; - LoopFilterMask& operator=(const LoopFilterMask&) = delete; - LoopFilterMask(LoopFilterMask&&) = delete; - LoopFilterMask& operator=(LoopFilterMask&&) = delete; - - // Allocates the loop filter masks for the given |width| and - // |height| if necessary and zeros out the appropriate number of - // entries. Returns true on success. - bool Reset(int width, int height); - - // Builds bit masks for tile boundaries. - // This function is called after the frame has been decoded so that - // information across tiles is available. 
- // Before this function call, bit masks of transform edges other than those - // on tile boundaries are built together with tile decoding, in - // Tile::BuildBitMask(). - void Build(const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, int tile_group_start, - int tile_group_end, - const BlockParametersHolder& block_parameters_holder, - const Array2D<TransformSize>& inter_transform_sizes); - - uint8_t GetLevel(int mask_id, int plane, LoopFilterType type, - int offset) const { - return loop_filter_masks_[mask_id].level[plane][type][offset]; - } - - uint64_t GetLeft(int mask_id, int plane, LoopFilterTransformSizeId tx_size_id, - int index) const { - return loop_filter_masks_[mask_id].left[plane][tx_size_id][index]; - } - - uint64_t GetTop(int mask_id, int plane, LoopFilterTransformSizeId tx_size_id, - int index) const { - return loop_filter_masks_[mask_id].top[plane][tx_size_id][index]; - } - - int num_64x64_blocks_per_row() const { return num_64x64_blocks_per_row_; } - - void SetLeft(uint64_t new_mask, int mask_id, int plane, - LoopFilterTransformSizeId transform_size_id, int index) { - loop_filter_masks_[mask_id].left[plane][transform_size_id][index] |= - new_mask; - } - - void SetTop(uint64_t new_mask, int mask_id, int plane, - LoopFilterTransformSizeId transform_size_id, int index) { - loop_filter_masks_[mask_id].top[plane][transform_size_id][index] |= - new_mask; - } - - void SetLevel(uint8_t level, int mask_id, int plane, LoopFilterType type, - int offset) { - loop_filter_masks_[mask_id].level[plane][type][offset] = level; - } - - static int GetIndex(int row4x4) { return row4x4 >> 2; } - - static int GetShift(int row4x4, int column4x4) { - return ((row4x4 & 3) << 4) | column4x4; - } - - static int GetLevelOffset(int row4x4, int column4x4) { - assert(row4x4 < 16); - assert(column4x4 < 16); - return (row4x4 << 4) | column4x4; - } - - static constexpr int GetModeId(PredictionMode mode) { - return static_cast<int>(kPredictionModeDeltasMask.Contains(mode)); - } - - // 7.14.5. - static void ComputeDeblockFilterLevels( - const ObuFrameHeader& frame_header, int segment_id, int level_index, - const int8_t delta_lf[kFrameLfCount], - uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) { - const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0]; - uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, - 0, kMaxLoopFilterValue); - const auto feature = static_cast<SegmentFeature>( - kSegmentFeatureLoopFilterYVertical + level_index); - level = Clip3( - level + frame_header.segmentation.feature_data[segment_id][feature], 0, - kMaxLoopFilterValue); - if (!frame_header.loop_filter.delta_enabled) { - static_assert(sizeof(deblock_filter_levels[0][0]) == 1, ""); - memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2); - return; - } - assert(frame_header.loop_filter.delta_enabled); - const int shift = level >> 5; - deblock_filter_levels[kReferenceFrameIntra][0] = Clip3( - level + - LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra], - shift), - 0, kMaxLoopFilterValue); - // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does - // not have to be populated. 
- for (int reference_frame = kReferenceFrameIntra + 1; - reference_frame < kNumReferenceFrameTypes; ++reference_frame) { - for (int mode_id = 0; mode_id < 2; ++mode_id) { - deblock_filter_levels[reference_frame][mode_id] = Clip3( - level + - LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] + - frame_header.loop_filter.mode_deltas[mode_id], - shift), - 0, kMaxLoopFilterValue); - } - } - } - - private: - std::unique_ptr<Data[]> loop_filter_masks_; - int num_64x64_blocks_ = -1; - int num_64x64_blocks_per_row_; - int num_64x64_blocks_per_column_; - - // Mask used to determine the index for mode_deltas lookup. - static constexpr BitMaskSet kPredictionModeDeltasMask{ - BitMaskSet(kPredictionModeNearestMv, kPredictionModeNearMv, - kPredictionModeNewMv, kPredictionModeNearestNearestMv, - kPredictionModeNearNearMv, kPredictionModeNearestNewMv, - kPredictionModeNewNearestMv, kPredictionModeNearNewMv, - kPredictionModeNewNearMv, kPredictionModeNewNewMv)}; - - // Validates that the loop filter masks at different transform sizes are - // mutually exclusive. Only used in an assert. This function will not be used - // in optimized builds. - bool IsValid() const; -}; - -} // namespace libgav1 - -#endif // LIBGAV1_SRC_LOOP_FILTER_MASK_H_ diff --git a/chromium/third_party/libgav1/src/src/motion_vector.cc b/chromium/third_party/libgav1/src/src/motion_vector.cc index c7a496e5979..8223f3decc1 100644 --- a/chromium/third_party/libgav1/src/src/motion_vector.cc +++ b/chromium/third_party/libgav1/src/src/motion_vector.cc @@ -479,19 +479,28 @@ void TemporalScan(const Tile::Block& block, bool is_compound, if (count != 0) { BlockParameters* const bp = block.bp; int reference_offsets[2]; - const int offset_0 = GetRelativeDistance( - tile.frame_header().order_hint, - tile.current_frame().order_hint(bp->reference_frame[0]), - tile.sequence_header().order_hint_shift_bits); + const int offset_0 = tile.current_frame() + .reference_info() + ->relative_distance_to[bp->reference_frame[0]]; reference_offsets[0] = Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance); if (is_compound) { - const int offset_1 = GetRelativeDistance( - tile.frame_header().order_hint, - tile.current_frame().order_hint(bp->reference_frame[1]), - tile.sequence_header().order_hint_shift_bits); + const int offset_1 = tile.current_frame() + .reference_info() + ->relative_distance_to[bp->reference_frame[1]]; reference_offsets[1] = Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance); + // Pad so that SIMD implementations won't read uninitialized memory. + if ((count & 1) != 0) { + temporal_mvs[count].mv32 = 0; + temporal_reference_offsets[count] = 0; + } + } else { + // Pad so that SIMD implementations won't read uninitialized memory. + for (int i = count; i < ((count + 3) & ~3); ++i) { + temporal_mvs[i].mv32 = 0; + temporal_reference_offsets[i] = 0; + } } AddTemporalReferenceMvCandidate( tile.frame_header(), reference_offsets, temporal_mvs, @@ -752,12 +761,12 @@ void AddSample(const Tile::Block& block, int delta_row, int delta_column, // or -1 so that it can be XORed and subtracted directly in ApplySign() and // corresponding SIMD implementations. 
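The relative_distance_to / relative_distance_from values consumed above are signed order-hint differences, precomputed once per frame instead of calling GetRelativeDistance() at every use. As a reminder of the underlying arithmetic, a spec-style sketch of the relative-distance computation is shown below; it mirrors AV1's get_relative_dist() and is not the exact libgav1 implementation (which carries the equivalent information as order_hint_shift_bits).

// Spec-style relative distance between two order hints, for illustration.
// The low |order_hint_bits| bits of the difference are sign-extended so that
// wrapped-around order hints still yield a small signed distance; a positive
// result means |a| is later than |b| in display order.
inline int RelativeDistance(int a, int b, int order_hint_bits) {
  if (order_hint_bits == 0) return 0;
  int diff = a - b;
  const int m = 1 << (order_hint_bits - 1);
  diff = (diff & (m - 1)) - (diff & m);
  return diff;
}

The motion field code then clips these distances to [-kMaxFrameDistance, kMaxFrameDistance], as seen in TemporalScan() above.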
bool MotionFieldProjection( - const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame, + const ObuFrameHeader& frame_header, const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>& reference_frames, - ReferenceFrameType source, unsigned int order_hint_shift_bits, - int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end, - int x8_start, int x8_end, TemporalMotionField* const motion_field) { + ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign, + int y8_start, int y8_end, int x8_start, int x8_end, + TemporalMotionField* const motion_field) { const int source_index = frame_header.reference_frame_index[source - kReferenceFrameLast]; auto* const source_frame = reference_frames[source_index].get(); @@ -770,12 +779,10 @@ bool MotionFieldProjection( } assert(reference_to_current_with_sign >= -kMaxFrameDistance); if (reference_to_current_with_sign > kMaxFrameDistance) return true; + const ReferenceInfo& reference_info = *source_frame->reference_info(); const dsp::Dsp& dsp = *dsp::GetDspTable(8); dsp.motion_field_projection_kernel( - source_frame->motion_field_reference_frame(y8_start, 0), - source_frame->motion_field_mv(y8_start, 0), - source_frame->order_hint_array(), current_frame.order_hint(source), - order_hint_shift_bits, reference_to_current_with_sign, dst_sign, y8_start, + reference_info, reference_to_current_with_sign, dst_sign, y8_start, y8_end, x8_start, x8_end, motion_field); return true; } @@ -921,62 +928,58 @@ void SetupMotionField( const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame, const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>& reference_frames, - unsigned int order_hint_shift_bits, int row4x4_start, int row4x4_end, - int column4x4_start, int column4x4_end, + int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end, TemporalMotionField* const motion_field) { assert(frame_header.use_ref_frame_mvs); - assert(order_hint_shift_bits != 0); const int y8_start = DivideBy2(row4x4_start); const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4)); const int x8_start = DivideBy2(column4x4_start); const int x8_end = DivideBy2(std::min(column4x4_end, frame_header.columns4x4)); - const int8_t* const reference_frame_index = - frame_header.reference_frame_index; - const int last_index = reference_frame_index[0]; - const int last_alternate_order_hint = - reference_frames[last_index]->order_hint(kReferenceFrameAlternate); - const int current_gold_order_hint = - current_frame.order_hint(kReferenceFrameGolden); - if (last_alternate_order_hint != current_gold_order_hint) { - const int reference_offset_last = - -GetRelativeDistance(current_frame.order_hint(kReferenceFrameLast), - frame_header.order_hint, order_hint_shift_bits); - if (std::abs(reference_offset_last) <= kMaxFrameDistance) { - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameLast, order_hint_shift_bits, - reference_offset_last, -1, y8_start, y8_end, - x8_start, x8_end, motion_field); + const int last_index = frame_header.reference_frame_index[0]; + const ReferenceInfo& reference_info = *current_frame.reference_info(); + if (!IsIntraFrame(reference_frames[last_index]->frame_type())) { + const int last_alternate_order_hint = + reference_frames[last_index] + ->reference_info() + ->order_hint[kReferenceFrameAlternate]; + const int current_gold_order_hint = + reference_info.order_hint[kReferenceFrameGolden]; + if (last_alternate_order_hint != current_gold_order_hint) { + 
const int reference_offset_last = + -reference_info.relative_distance_from[kReferenceFrameLast]; + if (std::abs(reference_offset_last) <= kMaxFrameDistance) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast, reference_offset_last, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); + } } } int ref_stamp = 1; const int reference_offset_backward = - GetRelativeDistance(current_frame.order_hint(kReferenceFrameBackward), - frame_header.order_hint, order_hint_shift_bits); + reference_info.relative_distance_from[kReferenceFrameBackward]; if (reference_offset_backward > 0 && - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameBackward, order_hint_shift_bits, - reference_offset_backward, 0, y8_start, y8_end, - x8_start, x8_end, motion_field)) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameBackward, reference_offset_backward, + 0, y8_start, y8_end, x8_start, x8_end, + motion_field)) { --ref_stamp; } const int reference_offset_alternate2 = - GetRelativeDistance(current_frame.order_hint(kReferenceFrameAlternate2), - frame_header.order_hint, order_hint_shift_bits); + reference_info.relative_distance_from[kReferenceFrameAlternate2]; if (reference_offset_alternate2 > 0 && - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameAlternate2, order_hint_shift_bits, + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate2, reference_offset_alternate2, 0, y8_start, y8_end, x8_start, x8_end, motion_field)) { --ref_stamp; } if (ref_stamp >= 0) { const int reference_offset_alternate = - GetRelativeDistance(current_frame.order_hint(kReferenceFrameAlternate), - frame_header.order_hint, order_hint_shift_bits); + reference_info.relative_distance_from[kReferenceFrameAlternate]; if (reference_offset_alternate > 0 && - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameAlternate, order_hint_shift_bits, + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate, reference_offset_alternate, 0, y8_start, y8_end, x8_start, x8_end, motion_field)) { --ref_stamp; @@ -984,13 +987,11 @@ void SetupMotionField( } if (ref_stamp >= 0) { const int reference_offset_last2 = - -GetRelativeDistance(current_frame.order_hint(kReferenceFrameLast2), - frame_header.order_hint, order_hint_shift_bits); + -reference_info.relative_distance_from[kReferenceFrameLast2]; if (std::abs(reference_offset_last2) <= kMaxFrameDistance) { - MotionFieldProjection(frame_header, current_frame, reference_frames, - kReferenceFrameLast2, order_hint_shift_bits, - reference_offset_last2, -1, y8_start, y8_end, - x8_start, x8_end, motion_field); + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast2, reference_offset_last2, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); } } } diff --git a/chromium/third_party/libgav1/src/src/motion_vector.h b/chromium/third_party/libgav1/src/src/motion_vector.h index f34bebb5346..d739e802831 100644 --- a/chromium/third_party/libgav1/src/src/motion_vector.h +++ b/chromium/third_party/libgav1/src/src/motion_vector.h @@ -51,8 +51,8 @@ void SetupMotionField( const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame, const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>& reference_frames, - unsigned int order_hint_shift_bits, int row4x4_start, int row4x4_end, - int column4x4_start, int column4x4_end, TemporalMotionField* motion_field); + int row4x4_start, int 
row4x4_end, int column4x4_start, int column4x4_end, + TemporalMotionField* motion_field); } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/obu_parser.cc b/chromium/third_party/libgav1/src/src/obu_parser.cc index 0a3ccd49254..ffa267fb348 100644 --- a/chromium/third_party/libgav1/src/src/obu_parser.cc +++ b/chromium/third_party/libgav1/src/src/obu_parser.cc @@ -1080,29 +1080,32 @@ void ObuParser::ComputeSegmentLosslessAndQIndex() { } bool ObuParser::ParseCdefParameters() { + const int coeff_shift = sequence_header_.color_config.bitdepth - 8; if (frame_header_.coded_lossless || frame_header_.allow_intrabc || !sequence_header_.enable_cdef) { - frame_header_.cdef.damping = 3; + frame_header_.cdef.damping = 3 + coeff_shift; return true; } Cdef* const cdef = &frame_header_.cdef; int64_t scratch; OBU_READ_LITERAL_OR_FAIL(2); - cdef->damping = scratch + 3; + cdef->damping = scratch + 3 + coeff_shift; OBU_READ_LITERAL_OR_FAIL(2); cdef->bits = scratch; for (int i = 0; i < (1 << cdef->bits); ++i) { OBU_READ_LITERAL_OR_FAIL(4); - cdef->y_primary_strength[i] = scratch; + cdef->y_primary_strength[i] = scratch << coeff_shift; OBU_READ_LITERAL_OR_FAIL(2); cdef->y_secondary_strength[i] = scratch; if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i]; + cdef->y_secondary_strength[i] <<= coeff_shift; if (sequence_header_.color_config.is_monochrome) continue; OBU_READ_LITERAL_OR_FAIL(4); - cdef->uv_primary_strength[i] = scratch; + cdef->uv_primary_strength[i] = scratch << coeff_shift; OBU_READ_LITERAL_OR_FAIL(2); cdef->uv_secondary_strength[i] = scratch; if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i]; + cdef->uv_secondary_strength[i] <<= coeff_shift; } return true; } @@ -1192,6 +1195,12 @@ bool ObuParser::IsSkipModeAllowed() { const unsigned int reference_hint = decoder_state_ .reference_order_hint[frame_header_.reference_frame_index[i]]; + // TODO(linfengz): |relative_distance| equals + // current_frame_->reference_info()-> + // relative_distance_from[i + kReferenceFrameLast]; + // However, the unit test ObuParserTest.SkipModeParameters() would fail. + // Will figure out how to initialize |current_frame_.reference_info_| in the + // RefCountedBuffer later. const int relative_distance = GetRelativeDistance(reference_hint, frame_header_.order_hint, sequence_header_.order_hint_shift_bits); @@ -1842,7 +1851,6 @@ bool ObuParser::ParseFrameParameters() { if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) { decoder_state_.reference_valid.fill(false); decoder_state_.reference_order_hint.fill(0); - current_frame_->ClearOrderHints(); } OBU_READ_BIT_OR_FAIL; frame_header_.enable_cdf_update = !static_cast<bool>(scratch); @@ -2092,16 +2100,44 @@ bool ObuParser::ParseFrameParameters() { return false; } if (!IsIntraFrame(frame_header_.frame_type)) { - for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { - const auto reference_frame = - static_cast<ReferenceFrameType>(kReferenceFrameLast + i); + // Initialize the kReferenceFrameIntra type reference frame information to + // simplify the frame type validation in motion field projection. + // Set the kReferenceFrameIntra type |order_hint_| to + // |frame_header_.order_hint|. This guarantees that in SIMD implementations, + // the other reference frame information of the kReferenceFrameIntra type + // could be correctly initialized using the following loop with + // |frame_header_.order_hint| being the |hint|. 
+ ReferenceInfo* const reference_info = current_frame_->reference_info(); + reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint; + reference_info->relative_distance_from[kReferenceFrameIntra] = 0; + reference_info->relative_distance_to[kReferenceFrameIntra] = 0; + reference_info->skip_references[kReferenceFrameIntra] = true; + reference_info->projection_divisions[kReferenceFrameIntra] = 0; + + for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) { + const auto reference_frame = static_cast<ReferenceFrameType>(i); const uint8_t hint = - decoder_state_ - .reference_order_hint[frame_header_.reference_frame_index[i]]; - current_frame_->set_order_hint(reference_frame, hint); - decoder_state_.reference_frame_sign_bias[reference_frame] = + decoder_state_.reference_order_hint + [frame_header_.reference_frame_index[i - kReferenceFrameLast]]; + reference_info->order_hint[reference_frame] = hint; + const int relative_distance_from = GetRelativeDistance(hint, frame_header_.order_hint, - sequence_header_.order_hint_shift_bits) > 0; + sequence_header_.order_hint_shift_bits); + const int relative_distance_to = + GetRelativeDistance(frame_header_.order_hint, hint, + sequence_header_.order_hint_shift_bits); + reference_info->relative_distance_from[reference_frame] = + relative_distance_from; + reference_info->relative_distance_to[reference_frame] = + relative_distance_to; + reference_info->skip_references[reference_frame] = + relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0; + reference_info->projection_divisions[reference_frame] = + reference_info->skip_references[reference_frame] + ? 0 + : kProjectionMvDivisionLookup[relative_distance_to]; + decoder_state_.reference_frame_sign_bias[reference_frame] = + relative_distance_from > 0; } } if (frame_header_.enable_cdf_update && @@ -2128,6 +2164,11 @@ bool ObuParser::ParseFrameHeader() { ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters(); if (!status) return false; ComputeSegmentLosslessAndQIndex(); + // Section 6.8.2: It is a requirement of bitstream conformance that + // delta_q_present is equal to 0 when CodedLossless is equal to 1. + if (frame_header_.coded_lossless && frame_header_.delta_q.present) { + return false; + } status = ParseLoopFilterParameters(); if (!status) return false; current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter); diff --git a/chromium/third_party/libgav1/src/src/post_filter.h b/chromium/third_party/libgav1/src/src/post_filter.h index 16c784ac458..c7af197575d 100644 --- a/chromium/third_party/libgav1/src/src/post_filter.h +++ b/chromium/third_party/libgav1/src/src/post_filter.h @@ -27,7 +27,7 @@ #include "src/dsp/common.h" #include "src/dsp/dsp.h" -#include "src/loop_filter_mask.h" +#include "src/frame_scratch_buffer.h" #include "src/loop_restoration_info.h" #include "src/obu_parser.h" #include "src/utils/array_2d.h" @@ -46,8 +46,6 @@ namespace libgav1 { // and loop restoration. // Historically, for example in libaom, loop filter refers to deblock filter. // To avoid name conflicts, we call this class PostFilter (post processing). -// Input info includes deblock parameters (bit masks), CDEF -// parameters, super resolution parameters and loop restoration parameters. // In-loop post filtering order is: // deblock --> CDEF --> super resolution--> loop restoration. // When CDEF and super resolution is not used, we can combine deblock @@ -76,14 +74,9 @@ class PostFilter { // * Output: |loop_restoration_buffer_|. 
// -> Now |frame_buffer_| contains the filtered frame. PostFilter(const ObuFrameHeader& frame_header, - const ObuSequenceHeader& sequence_header, LoopFilterMask* masks, - const Array2D<int16_t>& cdef_index, - const Array2D<TransformSize>& inter_transform_sizes, - LoopRestorationInfo* restoration_info, - BlockParametersHolder* block_parameters, YuvBuffer* frame_buffer, - YuvBuffer* deblock_buffer, const dsp::Dsp* dsp, - ThreadPool* thread_pool, uint8_t* threaded_window_buffer, - uint8_t* superres_line_buffer, int do_post_filter_mask); + const ObuSequenceHeader& sequence_header, + FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer, + const dsp::Dsp* dsp, int do_post_filter_mask); // non copyable/movable. PostFilter(const PostFilter&) = delete; @@ -123,9 +116,9 @@ class PostFilter { // with a shift to the top-left). void ApplyFilteringThreaded(); - // Does the overall post processing filter for one superblock row (starting at - // |row4x4| with height 4*|sb4x4|. Cdef, SuperRes and Loop Restoration lag by - // one superblock row to account for deblocking. + // Does the overall post processing filter for one superblock row starting at + // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter + // will not be applied. // // Filter behavior (single-threaded): // * Deblock: In-place filtering. The output is written to |source_buffer_|. @@ -143,26 +136,35 @@ class PostFilter { // top-left). // Returns the index of the last row whose post processing is complete and can // be used for referencing. - int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, - bool is_last_row); + int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row, + bool do_deblock); + + // Apply deblocking filter in one direction (specified by |loop_filter_type|) + // for the superblock row starting at |row4x4_start| for columns starting from + // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling) + // until the smallest multiple of 16 that is >= |column4x4_end| or until + // |frame_header_.columns4x4|, whichever is lower. This function must be + // called only if |DoDeblock()| returns true. + void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start, + int column4x4_start, int column4x4_end, int sb4x4); - bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); } static bool DoCdef(const ObuFrameHeader& frame_header, int do_post_filter_mask) { - return (do_post_filter_mask & 0x02) != 0 && - (frame_header.cdef.bits > 0 || + return (frame_header.cdef.bits > 0 || frame_header.cdef.y_primary_strength[0] > 0 || frame_header.cdef.y_secondary_strength[0] > 0 || frame_header.cdef.uv_primary_strength[0] > 0 || - frame_header.cdef.uv_secondary_strength[0] > 0); + frame_header.cdef.uv_secondary_strength[0] > 0) && + (do_post_filter_mask & 0x02) != 0; } + bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); } // If filter levels for Y plane (0 for vertical, 1 for horizontal), // are all zero, deblock filter will not be applied. 
static bool DoDeblock(const ObuFrameHeader& frame_header, uint8_t do_post_filter_mask) { - return (do_post_filter_mask & 0x01) != 0 && - (frame_header.loop_filter.level[0] > 0 || - frame_header.loop_filter.level[1] > 0); + return (frame_header.loop_filter.level[0] > 0 || + frame_header.loop_filter.level[1] > 0) && + (do_post_filter_mask & 0x01) != 0; } bool DoDeblock() const { return DoDeblock(frame_header_, do_post_filter_mask_); @@ -178,20 +180,21 @@ class PostFilter { const int8_t delta_lf[kFrameLfCount], uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount] [kNumReferenceFrameTypes][2]) const; - bool DoRestoration() const { - return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_); - } // Returns true if loop restoration will be performed for the given parameters // and mask. static bool DoRestoration(const LoopRestoration& loop_restoration, uint8_t do_post_filter_mask, int num_planes) { - if ((do_post_filter_mask & 0x08) == 0) return false; if (num_planes == kMaxPlanesMonochrome) { - return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone; + return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone && + (do_post_filter_mask & 0x08) != 0; } - return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone || - loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone || - loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone; + return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone || + loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone || + loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) && + (do_post_filter_mask & 0x08) != 0; + } + bool DoRestoration() const { + return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_); } // Returns a pointer to the unfiltered buffer. This is used by the Tile class @@ -204,13 +207,12 @@ class PostFilter { // mask. static bool DoSuperRes(const ObuFrameHeader& frame_header, uint8_t do_post_filter_mask) { - return (do_post_filter_mask & 0x04) != 0 && - frame_header.width != frame_header.upscaled_width; + return frame_header.width != frame_header.upscaled_width && + (do_post_filter_mask & 0x04) != 0; } bool DoSuperRes() const { return DoSuperRes(frame_header_, do_post_filter_mask_); } - LoopFilterMask* masks() const { return masks_; } LoopRestorationInfo* restoration_info() const { return restoration_info_; } uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane, int row4x4, int column4x4) const { @@ -249,37 +251,23 @@ class PostFilter { // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member // functions. using DeblockFilter = void (PostFilter::*)(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); - // The lookup table for picking the deblock filter, according to: - // kDeblockFilterBitMask (first dimension), and deblock filter type (second). - const DeblockFilter deblock_filter_type_table_[2][2] = { - {&PostFilter::VerticalDeblockFilterNoMask, - &PostFilter::HorizontalDeblockFilterNoMask}, - {&PostFilter::VerticalDeblockFilter, - &PostFilter::HorizontalDeblockFilter}, - }; - // Buffers for loop restoration intermediate results. Depending on the filter - // type, only one member of the union is used. - union IntermediateBuffers { - // For Wiener filter. - // The array |intermediate| in Section 7.17.4, the intermediate results - // between the horizontal and vertical filters. 
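[Editor's note] The Do*() helpers above each gate on one bit of |do_post_filter_mask| in addition to the frame header: 0x01 for deblocking, 0x02 for CDEF, 0x04 for SuperRes and 0x08 for loop restoration. A minimal sketch with hypothetical enumerator names (only the values come from the checks above):

    #include <cstdint>

    // Hypothetical names for the bits tested by DoDeblock()/DoCdef()/
    // DoSuperRes()/DoRestoration().
    enum PostFilterMaskBit : uint8_t {
      kApplyDeblockFilter = 0x01,
      kApplyCdef = 0x02,
      kApplySuperRes = 0x04,
      kApplyLoopRestoration = 0x08,
    };
    constexpr uint8_t kApplyAllPostFilters = 0x0f;  // Normal decoding path.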
- alignas(kMaxAlignment) - uint16_t wiener[(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1) * - kMaxSuperBlockSizeInPixels]; - // For self-guided filter. - struct { - // The arrays flt0 and flt1 in Section 7.17.2, the outputs of the box - // filter process in pass 0 and pass 1. - alignas( - kMaxAlignment) int32_t output[2][kMaxBoxFilterProcessOutputPixels]; - // The 2d arrays A and B in Section 7.17.3, the intermediate results in - // the box filter process. Reused for pass 0 and pass 1. - alignas(kMaxAlignment) uint32_t - intermediate_a[kBoxFilterProcessIntermediatePixels]; - alignas(kMaxAlignment) uint32_t - intermediate_b[kBoxFilterProcessIntermediatePixels]; - } box_filter; + int column4x4_start); + // The lookup table for picking the deblock filter, according to deblock + // filter type. + const DeblockFilter deblock_filter_func_[2] = { + &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter}; + + // The type of GetVerticalDeblockFilterEdgeInfo* member functions. + using DeblockVerticalEdgeInfo = bool (PostFilter::*)( + const Plane plane, int row4x4, int column4x4, const int8_t subsampling_x, + const int8_t subsampling_y, BlockParameters* const* bp_ptr, + uint8_t* level, int* step, int* filter_length) const; + // The lookup table for picking the GetVerticalDeblockEdgeInfo based on the + // plane. + const DeblockVerticalEdgeInfo deblock_vertical_edge_info_[kMaxPlanes] = { + &PostFilter::GetVerticalDeblockFilterEdgeInfo, + &PostFilter::GetVerticalDeblockFilterEdgeInfoUV, + &PostFilter::GetVerticalDeblockFilterEdgeInfoUV, }; // Functions common to all post filters. @@ -337,35 +325,26 @@ class PostFilter { int GetDeblockUnitId(int row_unit, int column_unit) const { return row_unit * num_64x64_blocks_per_row_ + column_unit; } - static dsp::LoopFilterSize GetLoopFilterSize(Plane plane, int step) { - if (step == 4) { - return dsp::kLoopFilterSize4; - } - if (step == 8) { - return (plane == kPlaneY) ? dsp::kLoopFilterSize8 : dsp::kLoopFilterSize6; - } - return (plane == kPlaneY) ? dsp::kLoopFilterSize14 : dsp::kLoopFilterSize6; - } - void InitDeblockFilterParams(); // Part of 7.14.4. - void GetDeblockFilterParams(uint8_t level, int* outer_thresh, - int* inner_thresh, int* hev_thresh) const; - template <LoopFilterType type> - bool GetDeblockFilterEdgeInfo(Plane plane, int row4x4, int column4x4, - int8_t subsampling_x, int8_t subsampling_y, - uint8_t* level, int* step, - int* filter_length) const; + bool GetHorizontalDeblockFilterEdgeInfo(Plane plane, int row4x4, + int column4x4, int8_t subsampling_x, + int8_t subsampling_y, uint8_t* level, + int* step, int* filter_length) const; + bool GetVerticalDeblockFilterEdgeInfo(Plane plane, int row4x4, int column4x4, + int8_t subsampling_x, + int8_t subsampling_y, + BlockParameters* const* bp_ptr, + uint8_t* level, int* step, + int* filter_length) const; + bool GetVerticalDeblockFilterEdgeInfoUV(Plane plane, int row4x4, + int column4x4, int8_t subsampling_x, + int8_t subsampling_y, + BlockParameters* const* bp_ptr, + uint8_t* level, int* step, + int* filter_length) const; void HorizontalDeblockFilter(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); - void VerticalDeblockFilter(Plane plane, int row4x4_start, int column4x4_start, - int unit_id); - // |unit_id| is not used, keep it to match the same interface as - // HorizontalDeblockFilter(). 
- void HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); - // |unit_id| is not used, keep it to match the same interface as - // VerticalDeblockFilter(). - void VerticalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, int unit_id); + int column4x4_start); + void VerticalDeblockFilter(Plane plane, int row4x4_start, + int column4x4_start); // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct // signature. static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter), @@ -385,7 +364,6 @@ class PostFilter { // Functions for the cdef filter. uint8_t* GetCdefBufferAndStride(int start_x, int start_y, int plane, - int subsampling_x, int subsampling_y, int window_buffer_plane_size, int* cdef_stride) const; // This function prepares the input source block for cdef filtering. The input @@ -394,9 +372,9 @@ class PostFilter { // pixels with a large value. This achieves the required behavior defined in // section 5.11.52 of the spec. template <typename Pixel> - void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row_64x64, - int column_64x64, uint16_t* cdef_source, - ptrdiff_t cdef_stride); + void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4, + int column4x4, uint16_t* cdef_source, + ptrdiff_t cdef_stride, bool y_plane); template <typename Pixel> void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4, int block_height4x4, int row4x4_start, @@ -434,12 +412,14 @@ class PostFilter { // Functions for the Loop Restoration filter. template <typename Pixel> - void ApplyLoopRestorationForOneUnit( - uint8_t* cdef_buffer, ptrdiff_t cdef_buffer_stride, Plane plane, - int plane_height, int x, int y, int row, int column, int unit_row, - int current_process_unit_height, int plane_process_unit_width, - int plane_unit_size, int num_horizontal_units, int plane_width, - Array2DView<Pixel>* loop_restored_window); + void ApplyLoopRestorationForOneUnit(uint8_t* cdef_buffer, + ptrdiff_t cdef_buffer_stride, Plane plane, + int plane_height, int x, int y, int row, + int column, int unit_row, + int current_process_unit_height, + int plane_unit_size, + int num_horizontal_units, int plane_width, + Array2DView<Pixel>* loop_restored_window); template <typename Pixel> void ApplyLoopRestorationForSuperBlock(Plane plane, int x, int y, int unit_row, @@ -454,8 +434,8 @@ class PostFilter { void ApplyLoopRestorationForOneRowInWindow( uint8_t* cdef_buffer, ptrdiff_t cdef_buffer_stride, Plane plane, int plane_height, int plane_width, int x, int y, int row, int unit_row, - int current_process_unit_height, int process_unit_width, int window_width, - int plane_unit_size, int num_horizontal_units); + int current_process_unit_height, int plane_unit_size, int window_width, + int num_horizontal_units); // Note for ApplyLoopRestoration(): // First, we must differentiate loop restoration processing unit from loop // restoration unit. @@ -501,12 +481,8 @@ class PostFilter { const int8_t subsampling_y_[kMaxPlanes]; const int8_t planes_; const int pixel_size_; - // This class does not take ownership of the masks/restoration_info, but it - // could change their values. 
- LoopFilterMask* const masks_; - uint8_t inner_thresh_[kMaxLoopFilterValue + 1] = {}; - uint8_t outer_thresh_[kMaxLoopFilterValue + 1] = {}; - uint8_t hev_thresh_[kMaxLoopFilterValue + 1] = {}; + const uint8_t* const inner_thresh_; + const uint8_t* const outer_thresh_; // This stores the deblocking filter levels assuming that the delta is zero. // This will be used by all superblocks whose delta is zero (without having to // recompute them). The dimensions (in order) are: segment_id, level_index @@ -529,8 +505,6 @@ class PostFilter { // nullptr as well. uint8_t* const threaded_window_buffer_; LoopRestorationInfo* const restoration_info_; - const int window_buffer_width_; - const int window_buffer_height_; // Pointer to the line buffer used by ApplySuperRes(). If SuperRes is on, then // the buffer will be large enough to hold one downscaled row + // kSuperResHorizontalBorder. @@ -560,8 +534,10 @@ class PostFilter { // This buffer is used only when both Cdef and Loop Restoration are on. YuvBuffer& deblock_buffer_; const uint8_t do_post_filter_mask_; - ThreadPool* const thread_pool_; + const int window_buffer_width_; + const int window_buffer_height_; + // Tracks the progress of the post filters. int progress_row_ = -1; @@ -571,13 +547,11 @@ class PostFilter { // Wiener filter needs extended border of three pixels. // Therefore the size of the buffer is 70x70 pixels. alignas(alignof(uint16_t)) uint8_t - block_buffer_[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * - sizeof(uint16_t)]; + block_buffer_[kRestorationUnitHeightWithBorders * + kRestorationUnitWidthWithBorders * sizeof(uint16_t)]; // A block buffer to hold the input that is converted to uint16_t before // cdef filtering. Only used in single threaded case. - uint16_t cdef_block_[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * 3]; + uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3]; template <int bitdepth, typename Pixel> friend class PostFilterSuperResTest; @@ -586,75 +560,69 @@ class PostFilter { friend class PostFilterHelperFuncTest; }; +template <typename Pixel> +void CopyTwoRows(const Pixel* src, const ptrdiff_t src_stride, Pixel** dst, + const ptrdiff_t dst_stride, const int width) { + for (int i = 0; i < kRestorationBorder - 1; ++i) { + memcpy(*dst, src, sizeof(Pixel) * width); + src += src_stride; + *dst += dst_stride; + } +} + // This function takes the cdef filtered buffer and the deblocked buffer to // prepare a block as input for loop restoration. // In striped loop restoration: -// The filtering needs to fetch the area of size (width + 6) x (height + 6), -// in which (width + 6) x height area is from cdef filtered frame -// (cdef_buffer). Top 3 rows and bottom 3 rows are from deblocked frame -// (deblock_buffer). +// The filtering needs to fetch the area of size (width + 6) x (height + 4), +// in which (width + 6) x height area is from cdef filtered frame (cdef_buffer). +// Top 2 rows and bottom 2 rows are from deblocked frame (deblock_buffer). // Special cases are: -// (1). when it is the top border, the top 3 rows are from cdef -// filtered frame. -// (2). when it is the bottom border, the bottom 3 rows are from cdef -// filtered frame. -// For the top 3 rows and bottom 3 rows, the top_row[0] is a copy of the -// top_row[1]. The bottom_row[2] is a copy of the bottom_row[1]. If cdef is -// not applied for this frame, cdef_buffer is the same as deblock_buffer. +// (1). 
when it is the top border, the top 2 rows are from cdef filtered frame. +// (2). when it is the bottom border, the bottom 2 rows are from cdef filtered +// frame. +// This function is called only when cdef is applied for this frame. template <typename Pixel> -void PrepareLoopRestorationBlock(const bool do_cdef, const uint8_t* cdef_buffer, +void PrepareLoopRestorationBlock(const uint8_t* cdef_buffer, ptrdiff_t cdef_stride, const uint8_t* deblock_buffer, ptrdiff_t deblock_stride, uint8_t* dest, ptrdiff_t dest_stride, const int width, const int height, const bool frame_top_border, const bool frame_bottom_border) { - const auto* cdef_ptr = reinterpret_cast<const Pixel*>(cdef_buffer); cdef_stride /= sizeof(Pixel); - const auto* deblock_ptr = reinterpret_cast<const Pixel*>(deblock_buffer); deblock_stride /= sizeof(Pixel); - auto* dst = reinterpret_cast<Pixel*>(dest); dest_stride /= sizeof(Pixel); - // Top 3 rows. - cdef_ptr -= (kRestorationBorder - 1) * cdef_stride + kRestorationBorder; - if (deblock_ptr != nullptr) deblock_ptr -= kRestorationBorder; - for (int i = 0; i < kRestorationBorder; ++i) { - if (frame_top_border || !do_cdef) { - memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } else { - memcpy(dst, deblock_ptr, - sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } - if (i > 0) { - if (deblock_ptr != nullptr) deblock_ptr += deblock_stride; - cdef_ptr += cdef_stride; - } - dst += dest_stride; + const auto* cdef_ptr = reinterpret_cast<const Pixel*>(cdef_buffer) - + (kRestorationBorder - 1) * cdef_stride - + kRestorationBorder; + const auto* deblock_ptr = + reinterpret_cast<const Pixel*>(deblock_buffer) - kRestorationBorder; + auto* dst = reinterpret_cast<Pixel*>(dest) + dest_stride; + int h = height; + // Top 2 rows. + if (frame_top_border) { + h += kRestorationBorder - 1; + } else { + CopyTwoRows<Pixel>(deblock_ptr, deblock_stride, &dst, dest_stride, + width + 2 * kRestorationBorder); + cdef_ptr += (kRestorationBorder - 1) * cdef_stride; + // If |frame_top_border| is true, then we are in the first superblock row, + // so in that case, do not increment |deblock_ptr| since we don't store + // anything from the first superblock row into |deblock_buffer|. + deblock_ptr += 4 * deblock_stride; } + if (frame_bottom_border) h += kRestorationBorder - 1; // Main body. - for (int i = 0; i < height; ++i) { + do { memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder)); cdef_ptr += cdef_stride; dst += dest_stride; - } - // Bottom 3 rows. If |frame_top_border| is true, then we are in the first - // superblock row, so in that case, do not increment |deblock_ptr| since we - // don't store anything from the first superblock row into |deblock_buffer|. - if (deblock_ptr != nullptr && !frame_top_border) { - deblock_ptr += deblock_stride * 4; - } - for (int i = 0; i < kRestorationBorder; ++i) { - if (frame_bottom_border || !do_cdef) { - memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } else { - memcpy(dst, deblock_ptr, - sizeof(Pixel) * (width + 2 * kRestorationBorder)); - } - if (i < kRestorationBorder - 2) { - if (deblock_ptr != nullptr) deblock_ptr += deblock_stride; - cdef_ptr += cdef_stride; - } - dst += dest_stride; + } while (--h != 0); + // Bottom 2 rows. 
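[Editor's note] To summarize the copying PrepareLoopRestorationBlock() performs (an editor's paraphrase of the code, assuming CDEF is enabled and the unit touches neither the frame top nor bottom): starting one row into |dest| (dst = dest + dest_stride), height + 4 rows of width + 2 * kRestorationBorder pixels are written:

    rows 0..1                 copied from deblock_buffer (CopyTwoRows)
    rows 2..height + 1        copied from cdef_buffer
    rows height + 2..height + 3  copied from deblock_buffer (CopyTwoRows)

At the frame top or bottom the corresponding two border rows come from cdef_buffer instead, which the code handles by extending the main-body copy (h += kRestorationBorder - 1).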
+ if (!frame_bottom_border) { + deblock_ptr += (kRestorationBorder - 1) * deblock_stride; + CopyTwoRows<Pixel>(deblock_ptr, deblock_stride, &dst, dest_stride, + width + 2 * kRestorationBorder); } } diff --git a/chromium/third_party/libgav1/src/src/post_filter/cdef.cc b/chromium/third_party/libgav1/src/src/post_filter/cdef.cc index c169acd6532..2b3b7119f0b 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/cdef.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/cdef.cc @@ -20,6 +20,7 @@ namespace libgav1 { namespace { constexpr int kStep64x64 = 16; // =64/4. +constexpr int kCdefSkip = 8; constexpr uint8_t kCdefUvDirection[2][2][8] = { {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}}, @@ -57,19 +58,31 @@ void CopyRowForCdef(const Pixel* src, int block_width, int unit_width, } } +// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to +// |dst|. +void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width, int height, size_t pixel_size) { + int y = height; + do { + memcpy(dst, src, width * pixel_size); + src += src_stride; + dst += dst_stride; + } while (--y != 0); +} + } // namespace uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x, const int start_y, const int plane, - const int subsampling_x, - const int subsampling_y, const int window_buffer_plane_size, int* cdef_stride) const { if (thread_pool_ != nullptr) { // write output to threaded_window_buffer. *cdef_stride = window_buffer_width_ * pixel_size_; - const int column_window = start_x % (window_buffer_width_ >> subsampling_x); - const int row_window = start_y % (window_buffer_height_ >> subsampling_y); + const int column_window = + start_x % (window_buffer_width_ >> subsampling_x_[plane]); + const int row_window = + start_y % (window_buffer_height_ >> subsampling_y_[plane]); return threaded_window_buffer_ + plane * window_buffer_plane_size + row_window * (*cdef_stride) + column_window * pixel_size_; } @@ -80,72 +93,82 @@ uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x, template <typename Pixel> void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4, - int row_64x64, int column_64x64, - uint16_t* cdef_source, - ptrdiff_t cdef_stride) { - for (int plane = kPlaneY; plane < planes_; ++plane) { - uint16_t* cdef_src = - cdef_source + plane * kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders; - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int start_x = MultiplyBy4(column_64x64) >> subsampling_x; - const int start_y = MultiplyBy4(row_64x64) >> subsampling_y; - const int plane_width = RightShiftWithRounding(width_, subsampling_x); - const int plane_height = RightShiftWithRounding(height_, subsampling_y); - const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x; - const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y; - // unit_width, unit_height are the same as block_width, block_height unless - // it reaches the frame boundary, where block_width < 64 or - // block_height < 64. unit_width, unit_height guarantee we build blocks on - // a multiple of 8. - const int unit_width = Align(block_width, (subsampling_x > 0) ? 4 : 8); - const int unit_height = Align(block_height, (subsampling_y > 0) ? 
4 : 8); - const bool is_frame_left = column_64x64 == 0; - const bool is_frame_right = start_x + block_width >= plane_width; - const bool is_frame_top = row_64x64 == 0; - const bool is_frame_bottom = start_y + block_height >= plane_height; + int row4x4, int column4x4, + uint16_t* cdef_source, ptrdiff_t cdef_stride, + const bool y_plane) { + assert(y_plane || planes_ == kMaxPlanes); + const int max_planes = y_plane ? 1 : kMaxPlanes; + const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU]; + const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU]; + const int start_x = MultiplyBy4(column4x4) >> subsampling_x; + const int start_y = MultiplyBy4(row4x4) >> subsampling_y; + const int plane_width = RightShiftWithRounding(width_, subsampling_x); + const int plane_height = RightShiftWithRounding(height_, subsampling_y); + const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x; + const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y; + // unit_width, unit_height are the same as block_width, block_height unless + // it reaches the frame boundary, where block_width < 64 or + // block_height < 64. unit_width, unit_height guarantee we build blocks on + // a multiple of 8. + const int unit_width = Align(block_width, 8 >> subsampling_x); + const int unit_height = Align(block_height, 8 >> subsampling_y); + const bool is_frame_left = column4x4 == 0; + const bool is_frame_right = start_x + block_width >= plane_width; + const bool is_frame_top = row4x4 == 0; + const bool is_frame_bottom = start_y + block_height >= plane_height; + const int y_offset = is_frame_top ? 0 : kCdefBorder; + + for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) { + uint16_t* cdef_src = cdef_source + plane * kCdefUnitSizeWithBorders * + kCdefUnitSizeWithBorders; const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel); const Pixel* src_buffer = reinterpret_cast<const Pixel*>(source_buffer_[plane]) + - (start_y - (is_frame_top ? 0 : kCdefBorder)) * src_stride + start_x; + (start_y - y_offset) * src_stride + start_x; // All the copying code will use negative indices for populating the left // border. So the starting point is set to kCdefBorder. cdef_src += kCdefBorder; // Copy the top 2 rows. - for (int y = 0; y < kCdefBorder; ++y) { - if (is_frame_top) { + if (is_frame_top) { + for (int y = 0; y < kCdefBorder; ++y) { Memset(cdef_src - kCdefBorder, kCdefLargeValue, unit_width + 2 * kCdefBorder); - } else { + cdef_src += cdef_stride; + } + } else { + for (int y = 0; y < kCdefBorder; ++y) { CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left, is_frame_right, cdef_src); src_buffer += src_stride; + cdef_src += cdef_stride; } - cdef_src += cdef_stride; } // Copy the body. - for (int y = 0; y < block_height; ++y) { + int y = block_height; + do { CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left, is_frame_right, cdef_src); cdef_src += cdef_stride; src_buffer += src_stride; - } + } while (--y != 0); // Copy the bottom 2 rows. 
- for (int y = 0; y < kCdefBorder + unit_height - block_height; ++y) { - if (is_frame_bottom) { + if (is_frame_bottom) { + do { Memset(cdef_src - kCdefBorder, kCdefLargeValue, unit_width + 2 * kCdefBorder); - } else { + cdef_src += cdef_stride; + } while (++y < kCdefBorder + unit_height - block_height); + } else { + do { CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left, is_frame_right, cdef_src); src_buffer += src_stride; - } - cdef_src += cdef_stride; + cdef_src += cdef_stride; + } while (++y < kCdefBorder + unit_height - block_height); } } } @@ -156,130 +179,237 @@ void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index, const int block_height4x4, const int row4x4_start, const int column4x4_start) { - const int coeff_shift = bitdepth_ - 8; - const int step = kNum4x4BlocksWide[kBlock8x8]; + // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling). + static constexpr int kStep = 8; + static constexpr int kStep4x4 = 2; + const int window_buffer_plane_size = window_buffer_width_ * window_buffer_height_ * pixel_size_; + int cdef_buffer_row_base_stride[kMaxPlanes]; + int cdef_buffer_stride[kMaxPlanes]; + uint8_t* cdef_buffer_row_base[kMaxPlanes]; + int src_buffer_row_base_stride[kMaxPlanes]; + const uint8_t* src_buffer_row_base[kMaxPlanes]; + int column_step[kMaxPlanes]; + for (int plane = kPlaneY; plane < planes_; ++plane) { + const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane]; + const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane]; + cdef_buffer_row_base[plane] = GetCdefBufferAndStride( + start_x, start_y, plane, window_buffer_plane_size, + &cdef_buffer_stride[plane]); + cdef_buffer_row_base_stride[plane] = + cdef_buffer_stride[plane] * (kStep >> subsampling_y_[plane]); + src_buffer_row_base[plane] = source_buffer_[plane] + + start_y * frame_buffer_.stride(plane) + + start_x * pixel_size_; + src_buffer_row_base_stride[plane] = + frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]); + column_step[plane] = (kStep >> subsampling_x_[plane]) * pixel_size_; + } if (index == -1) { for (int plane = kPlaneY; plane < planes_; ++plane) { - const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane]; - const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane]; - int cdef_stride; - uint8_t* const cdef_buffer = GetCdefBufferAndStride( - start_x, start_y, plane, subsampling_x_[plane], subsampling_y_[plane], - window_buffer_plane_size, &cdef_stride); - const int src_stride = frame_buffer_.stride(plane); - uint8_t* const src_buffer = - source_buffer_[plane] + start_y * src_stride + start_x * pixel_size_; - const int block_width = - MultiplyBy4(block_width4x4) >> subsampling_x_[plane]; - const int block_height = - MultiplyBy4(block_height4x4) >> subsampling_y_[plane]; - for (int y = 0; y < block_height; ++y) { - memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride, - block_width * pixel_size_); + CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane), + cdef_buffer_row_base[plane], cdef_buffer_stride[plane], + MultiplyBy4(block_width4x4) >> subsampling_x_[plane], + MultiplyBy4(block_height4x4) >> subsampling_y_[plane], + pixel_size_); + } + return; + } + + PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start, + column4x4_start, cdef_block, kCdefUnitSizeWithBorders, + true); + + // Stored direction used during the u/v pass. If bit 3 is set, then block is + // a skip. 
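[Editor's note] The luma pass records one int per 8x8 block in |direction_y| and the chroma pass reuses it. Because kCdefSkip is 8 (bit 3) while directions occupy [0, 7], a single value carries both facts. A self-contained sketch of that encoding; the helper names are hypothetical, only the values mirror the code:

    constexpr int kCdefSkipFlag = 8;  // Bit 3; direction values stay in [0, 7].

    inline int EncodeLumaCdefState(bool skip, int direction) {
      return skip ? kCdefSkipFlag : direction;
    }
    inline bool LumaBlockWasSkipped(int state) {
      return (state & kCdefSkipFlag) != 0;
    }
    inline int LumaDirection(int state) { return state & 7; }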
+ int direction_y[8 * 8]; + int y_index = 0; + + const uint8_t y_primary_strength = + frame_header_.cdef.y_primary_strength[index]; + const uint8_t y_secondary_strength = + frame_header_.cdef.y_secondary_strength[index]; + + const bool compute_direction_and_variance = + (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0; + BlockParameters* const* bp_row0_base = + block_parameters_.Address(row4x4_start, column4x4_start); + BlockParameters* const* bp_row1_base = + bp_row0_base + block_parameters_.columns4x4(); + const int bp_stride = MultiplyBy2(block_parameters_.columns4x4()); + int row4x4 = row4x4_start; + do { + uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY]; + const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY]; + BlockParameters* const* bp0 = bp_row0_base; + BlockParameters* const* bp1 = bp_row1_base; + int column4x4 = column4x4_start; + do { + const int block_width = kStep; + const int block_height = kStep; + const int cdef_stride = cdef_buffer_stride[kPlaneY]; + uint8_t* const cdef_buffer = cdef_buffer_base; + const int src_stride = frame_buffer_.stride(kPlaneY); + const uint8_t* const src_buffer = src_buffer_base; + + const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip && + (*(bp1 + 1))->skip; + + if (skip) { // No cdef filtering. + direction_y[y_index] = kCdefSkip; + CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride, + block_width, block_height, pixel_size_); + } else { + // Zero out residual skip flag. + direction_y[y_index] = 0; + + int variance = 0; + if (compute_direction_and_variance) { + dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index], + &variance); + } + const int direction = + (y_primary_strength == 0) ? 0 : direction_y[y_index]; + const int variance_strength = + ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0; + const uint8_t primary_strength = + (variance != 0) + ? 
(y_primary_strength * (4 + variance_strength) + 8) >> 4 + : 0; + + if ((primary_strength | y_secondary_strength) == 0) { + CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride, + block_width, block_height, pixel_size_); + } else { + uint16_t* cdef_src = cdef_block + kPlaneY * kCdefUnitSizeWithBorders * + kCdefUnitSizeWithBorders; + cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder; + cdef_src += + (MultiplyBy4(row4x4 - row4x4_start)) * kCdefUnitSizeWithBorders + + (MultiplyBy4(column4x4 - column4x4_start)); + dsp_.cdef_filter(cdef_src, kCdefUnitSizeWithBorders, block_width, + block_height, primary_strength, y_secondary_strength, + frame_header_.cdef.damping, direction, cdef_buffer, + cdef_stride); + } } + cdef_buffer_base += column_step[kPlaneY]; + src_buffer_base += column_step[kPlaneY]; + + bp0 += kStep4x4; + bp1 += kStep4x4; + column4x4 += kStep4x4; + y_index++; + } while (column4x4 < column4x4_start + block_width4x4); + + cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY]; + src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY]; + bp_row0_base += bp_stride; + bp_row1_base += bp_stride; + row4x4 += kStep4x4; + } while (row4x4 < row4x4_start + block_height4x4); + + if (planes_ == kMaxPlanesMonochrome) { + return; + } + + const uint8_t uv_primary_strength = + frame_header_.cdef.uv_primary_strength[index]; + const uint8_t uv_secondary_strength = + frame_header_.cdef.uv_secondary_strength[index]; + + if ((uv_primary_strength | uv_secondary_strength) == 0) { + for (int plane = kPlaneU; plane <= kPlaneV; ++plane) { + CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane), + cdef_buffer_row_base[plane], cdef_buffer_stride[plane], + MultiplyBy4(block_width4x4) >> subsampling_x_[plane], + MultiplyBy4(block_height4x4) >> subsampling_y_[plane], + pixel_size_); } return; } PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start, - column4x4_start, cdef_block, - kRestorationProcessingUnitSizeWithBorders); - - for (int row4x4 = row4x4_start; row4x4 < row4x4_start + block_height4x4; - row4x4 += step) { - for (int column4x4 = column4x4_start; - column4x4 < column4x4_start + block_width4x4; column4x4 += step) { - const bool skip = - block_parameters_.Find(row4x4, column4x4) != nullptr && - block_parameters_.Find(row4x4 + 1, column4x4) != nullptr && - block_parameters_.Find(row4x4, column4x4 + 1) != nullptr && - block_parameters_.Find(row4x4 + 1, column4x4 + 1) != nullptr && - block_parameters_.Find(row4x4, column4x4)->skip && - block_parameters_.Find(row4x4 + 1, column4x4)->skip && - block_parameters_.Find(row4x4, column4x4 + 1)->skip && - block_parameters_.Find(row4x4 + 1, column4x4 + 1)->skip; - int damping = frame_header_.cdef.damping + coeff_shift; - int direction_y; - int direction; - int variance; - uint8_t primary_strength; - uint8_t secondary_strength; + column4x4_start, cdef_block, kCdefUnitSizeWithBorders, + false); - for (int plane = kPlaneY; plane < planes_; ++plane) { - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int start_x = MultiplyBy4(column4x4) >> subsampling_x; - const int start_y = MultiplyBy4(row4x4) >> subsampling_y; - const int block_width = 8 >> subsampling_x; - const int block_height = 8 >> subsampling_y; - int cdef_stride; - uint8_t* const cdef_buffer = GetCdefBufferAndStride( - start_x, start_y, plane, subsampling_x, subsampling_y, - window_buffer_plane_size, &cdef_stride); + for (int plane = kPlaneU; plane <= kPlaneV; ++plane) 
{ + const int8_t subsampling_x = subsampling_x_[plane]; + const int8_t subsampling_y = subsampling_y_[plane]; + const int block_width = kStep >> subsampling_x; + const int block_height = kStep >> subsampling_y; + int row4x4 = row4x4_start; + + y_index = 0; + do { + uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane]; + const uint8_t* src_buffer_base = src_buffer_row_base[plane]; + int column4x4 = column4x4_start; + do { + const int cdef_stride = cdef_buffer_stride[plane]; + uint8_t* const cdef_buffer = cdef_buffer_base; const int src_stride = frame_buffer_.stride(plane); - uint8_t* const src_buffer = source_buffer_[plane] + - start_y * src_stride + - start_x * pixel_size_; + const uint8_t* const src_buffer = src_buffer_base; + const bool skip = direction_y[y_index] & kCdefSkip; + int dual_cdef = 0; if (skip) { // No cdef filtering. - for (int y = 0; y < block_height; ++y) { - memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride, - block_width * pixel_size_); + CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride, + block_width, block_height, pixel_size_); + } else { + // Make sure block pair is not out of bounds. + if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) { + // Enable dual processing if subsampling_x is 1. + dual_cdef = subsampling_x; } - continue; - } - if (plane == kPlaneY) { - dsp_.cdef_direction(src_buffer, src_stride, &direction_y, &variance); - primary_strength = frame_header_.cdef.y_primary_strength[index] - << coeff_shift; - secondary_strength = frame_header_.cdef.y_secondary_strength[index] - << coeff_shift; - direction = (primary_strength == 0) ? 0 : direction_y; - const int variance_strength = - ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) - : 0; - primary_strength = - (variance != 0) - ? (primary_strength * (4 + variance_strength) + 8) >> 4 - : 0; - } else { - primary_strength = frame_header_.cdef.uv_primary_strength[index] - << coeff_shift; - secondary_strength = frame_header_.cdef.uv_secondary_strength[index] - << coeff_shift; - direction = - (primary_strength == 0) - ? 0 - : kCdefUvDirection[subsampling_x][subsampling_y][direction_y]; - damping = frame_header_.cdef.damping + coeff_shift - 1; - } + int direction = (uv_primary_strength == 0) + ? 0 + : kCdefUvDirection[subsampling_x][subsampling_y] + [direction_y[y_index]]; + + if (dual_cdef != 0) { + if (uv_primary_strength && + direction_y[y_index] != direction_y[y_index + 1]) { + // Disable dual processing if the second block of the pair does + // not have the same direction. + dual_cdef = 0; + } - if ((primary_strength | secondary_strength) == 0) { - for (int y = 0; y < block_height; ++y) { - memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride, - block_width * pixel_size_); + // Disable dual processing if the second block of the pair is a + // skip. 
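[Editor's note] The |dual_cdef| path lets one cdef_filter() call cover two horizontally adjacent chroma blocks when subsampling_x is 1, but only when the pair is in bounds, the two luma blocks share a direction (or the primary strength is zero), and the second block is not a skip. A condensed restatement of that predicate as an editor's sketch using the surrounding variable names, not a drop-in replacement:

    // |in_bounds| corresponds to
    // column4x4 + 2 * kStep4x4 <= column4x4_start + block_width4x4.
    inline bool CanUseDualCdef(int subsampling_x, bool in_bounds,
                               int uv_primary_strength, int direction_first,
                               int direction_second, int cdef_skip_flag) {
      return subsampling_x == 1 && in_bounds &&
             (uv_primary_strength == 0 || direction_first == direction_second) &&
             direction_second != cdef_skip_flag;
    }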
+ if (direction_y[y_index + 1] == kCdefSkip) { + dual_cdef = 0; + } } - continue; + + uint16_t* cdef_src = cdef_block + plane * kCdefUnitSizeWithBorders * + kCdefUnitSizeWithBorders; + cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder; + cdef_src += + (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) * + kCdefUnitSizeWithBorders + + (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x); + dsp_.cdef_filter(cdef_src, kCdefUnitSizeWithBorders, + block_width << dual_cdef, block_height, + uv_primary_strength, uv_secondary_strength, + frame_header_.cdef.damping - 1, direction, + cdef_buffer, cdef_stride); } - uint16_t* cdef_src = - cdef_block + plane * kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders; - cdef_src += kCdefBorder * kRestorationProcessingUnitSizeWithBorders + - kCdefBorder; - cdef_src += (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) * - kRestorationProcessingUnitSizeWithBorders + - (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x); - dsp_.cdef_filter(cdef_src, kRestorationProcessingUnitSizeWithBorders, - frame_header_.rows4x4, frame_header_.columns4x4, - start_x, start_y, subsampling_x, subsampling_y, - primary_strength, secondary_strength, damping, - direction, cdef_buffer, cdef_stride); - } - } + // When dual_cdef is set, the above cdef_filter() will process 2 blocks, + // so adjust the pointers and indexes for 2 blocks. + cdef_buffer_base += column_step[plane] << dual_cdef; + src_buffer_base += column_step[plane] << dual_cdef; + column4x4 += kStep4x4 << dual_cdef; + y_index += 1 << dual_cdef; + } while (column4x4 < column4x4_start + block_width4x4); + + cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane]; + src_buffer_row_base[plane] += src_buffer_row_base_stride[plane]; + row4x4 += kStep4x4; + } while (row4x4 < row4x4_start + block_height4x4); } } @@ -336,8 +466,7 @@ void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4, template <typename Pixel> void PostFilter::ApplyCdefForOneRowInWindow(const int row4x4, const int column4x4_start) { - uint16_t cdef_block[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * 3]; + uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3]; for (int column4x4_64x64 = 0; column4x4_64x64 < std::min(DivideBy4(window_buffer_width_), diff --git a/chromium/third_party/libgav1/src/src/post_filter/deblock.cc b/chromium/third_party/libgav1/src/src/post_filter/deblock.cc index db21d3db117..afe2895dbe3 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/deblock.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/deblock.cc @@ -17,6 +17,76 @@ #include "src/utils/blocking_counter.h" namespace libgav1 { +namespace { + +constexpr uint8_t HevThresh(int level) { return DivideBy16(level); } + +// GetLoopFilterSize* functions depend on this exact ordering of the +// LoopFilterSize enums. +static_assert(dsp::kLoopFilterSize4 == 0, ""); +static_assert(dsp::kLoopFilterSize6 == 1, ""); +static_assert(dsp::kLoopFilterSize8 == 2, ""); +static_assert(dsp::kLoopFilterSize14 == 3, ""); + +dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) { + // |filter_length| must be a power of 2. 
+ assert((filter_length & (filter_length - 1)) == 0); + // This code is the branch free equivalent of: + // if (filter_length == 4) return kLoopFilterSize4; + // if (filter_length == 8) return kLoopFilterSize8; + // return kLoopFilterSize14; + return static_cast<dsp::LoopFilterSize>( + MultiplyBy2(static_cast<int>(filter_length > 4)) + + static_cast<int>(filter_length > 8)); +} + +constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) { + // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4, + // otherwise size is kLoopFilterSize6. + return static_cast<dsp::LoopFilterSize>(filter_length != 4); +} + +// 7.14.5. +void ComputeDeblockFilterLevelsHelper( + const ObuFrameHeader& frame_header, int segment_id, int level_index, + const int8_t delta_lf[kFrameLfCount], + uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) { + const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0]; + uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0, + kMaxLoopFilterValue); + const auto feature = static_cast<SegmentFeature>( + kSegmentFeatureLoopFilterYVertical + level_index); + level = + Clip3(level + frame_header.segmentation.feature_data[segment_id][feature], + 0, kMaxLoopFilterValue); + if (!frame_header.loop_filter.delta_enabled) { + static_assert(sizeof(deblock_filter_levels[0][0]) == 1, ""); + memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2); + return; + } + assert(frame_header.loop_filter.delta_enabled); + const int shift = level >> 5; + deblock_filter_levels[kReferenceFrameIntra][0] = Clip3( + level + + LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra], + shift), + 0, kMaxLoopFilterValue); + // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does + // not have to be populated. 
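[Editor's note] The loop below folds the per-reference-frame and per-mode deltas into the base level. A worked sketch under assumed inputs (base level 32, ref_delta -1, mode_delta 0), following the Clip3 formula in the loop; the function name is hypothetical and the left shift is written as a multiply to stay well defined for negative deltas:

    #include <algorithm>
    #include <cstdint>

    constexpr int kMaxLoopFilterValueSketch = 63;  // Matches kMaxLoopFilterValue.

    inline uint8_t DeblockLevelSketch(int level, int ref_delta, int mode_delta) {
      const int shift = level >> 5;                               // 1 for level 32.
      const int adjusted = level + (ref_delta + mode_delta) * (1 << shift);  // 30.
      return static_cast<uint8_t>(
          std::min(std::max(adjusted, 0), kMaxLoopFilterValueSketch));
    }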
+ for (int reference_frame = kReferenceFrameIntra + 1; + reference_frame < kNumReferenceFrameTypes; ++reference_frame) { + for (int mode_id = 0; mode_id < 2; ++mode_id) { + deblock_filter_levels[reference_frame][mode_id] = Clip3( + level + + LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] + + frame_header.loop_filter.mode_deltas[mode_id], + shift), + 0, kMaxLoopFilterValue); + } + } +} + +} // namespace void PostFilter::ComputeDeblockFilterLevels( const int8_t delta_lf[kFrameLfCount], @@ -28,13 +98,13 @@ void PostFilter::ComputeDeblockFilterLevels( ++segment_id) { int level_index = 0; for (; level_index < 2; ++level_index) { - LoopFilterMask::ComputeDeblockFilterLevels( + ComputeDeblockFilterLevelsHelper( frame_header_, segment_id, level_index, delta_lf, deblock_filter_levels[segment_id][level_index]); } for (; level_index < kFrameLfCount; ++level_index) { if (frame_header_.loop_filter.level[level_index] != 0) { - LoopFilterMask::ComputeDeblockFilterLevels( + ComputeDeblockFilterLevelsHelper( frame_header_, segment_id, level_index, delta_lf, deblock_filter_levels[segment_id][level_index]); } @@ -42,62 +112,28 @@ void PostFilter::ComputeDeblockFilterLevels( } } -void PostFilter::InitDeblockFilterParams() { - const int8_t sharpness = frame_header_.loop_filter.sharpness; - assert(0 <= sharpness && sharpness < 8); - const int shift = DivideBy4(sharpness + 3); // ceil(sharpness / 4.0) - for (int level = 0; level <= kMaxLoopFilterValue; ++level) { - uint8_t limit = level >> shift; - if (sharpness > 0) { - limit = Clip3(limit, 1, 9 - sharpness); - } else { - limit = std::max(limit, static_cast<uint8_t>(1)); - } - inner_thresh_[level] = limit; - outer_thresh_[level] = 2 * (level + 2) + limit; - hev_thresh_[level] = level >> 4; - } -} - -void PostFilter::GetDeblockFilterParams(uint8_t level, int* outer_thresh, - int* inner_thresh, - int* hev_thresh) const { - *outer_thresh = outer_thresh_[level]; - *inner_thresh = inner_thresh_[level]; - *hev_thresh = hev_thresh_[level]; -} - -template <LoopFilterType type> -bool PostFilter::GetDeblockFilterEdgeInfo(const Plane plane, int row4x4, - int column4x4, - const int8_t subsampling_x, - const int8_t subsampling_y, - uint8_t* level, int* step, - int* filter_length) const { +bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(const Plane plane, + int row4x4, int column4x4, + const int8_t subsampling_x, + const int8_t subsampling_y, + uint8_t* level, int* step, + int* filter_length) const { row4x4 = GetDeblockPosition(row4x4, subsampling_y); column4x4 = GetDeblockPosition(column4x4, subsampling_x); const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4); const TransformSize transform_size = (plane == kPlaneY) ? inter_transform_sizes_[row4x4][column4x4] : bp->uv_transform_size; - *step = (type == kLoopFilterTypeHorizontal) ? kTransformHeight[transform_size] - : kTransformWidth[transform_size]; - if ((type == kLoopFilterTypeHorizontal && row4x4 == subsampling_y) || - (type == kLoopFilterTypeVertical && column4x4 == subsampling_x)) { - return false; - } + *step = kTransformHeight[transform_size]; + if (row4x4 == subsampling_y) return false; - const int filter_id = kDeblockFilterLevelIndex[plane][type]; + const int filter_id = + kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal]; const uint8_t level_this = bp->deblock_filter_level[filter_id]; - const int row4x4_prev = (type == kLoopFilterTypeHorizontal) - ? row4x4 - (1 << subsampling_y) - : row4x4; - const int column4x4_prev = (type == kLoopFilterTypeHorizontal) - ? 
column4x4 - : column4x4 - (1 << subsampling_x); - assert(row4x4_prev >= 0 && column4x4_prev >= 0); + const int row4x4_prev = row4x4 - (1 << subsampling_y); + assert(row4x4_prev >= 0); const BlockParameters* bp_prev = - block_parameters_.Find(row4x4_prev, column4x4_prev); + block_parameters_.Find(row4x4_prev, column4x4); const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; *level = level_this; if (level_this == 0) { @@ -107,373 +143,91 @@ bool PostFilter::GetDeblockFilterEdgeInfo(const Plane plane, int row4x4, const BlockSize size = kPlaneResidualSize[bp->size][subsampling_x][subsampling_y]; - const int prediction_masks = (type == kLoopFilterTypeHorizontal) - ? kBlockHeightPixels[size] - 1 - : kBlockWidthPixels[size] - 1; - const int pixel_position = MultiplyBy4((type == kLoopFilterTypeHorizontal) - ? row4x4 >> subsampling_y - : column4x4 >> subsampling_x); + const int prediction_masks = kBlockHeightPixels[size] - 1; + const int pixel_position = MultiplyBy4(row4x4 >> subsampling_y); const bool is_border = (pixel_position & prediction_masks) == 0; const bool skip = bp->skip && bp->is_inter; const bool skip_prev = bp_prev->skip && bp_prev->is_inter; if (!skip || !skip_prev || is_border) { const TransformSize transform_size_prev = - (plane == kPlaneY) ? inter_transform_sizes_[row4x4_prev][column4x4_prev] + (plane == kPlaneY) ? inter_transform_sizes_[row4x4_prev][column4x4] : bp_prev->uv_transform_size; - const int step_prev = (type == kLoopFilterTypeHorizontal) - ? kTransformHeight[transform_size_prev] - : kTransformWidth[transform_size_prev]; + const int step_prev = kTransformHeight[transform_size_prev]; *filter_length = std::min(*step, step_prev); return true; } return false; } -void PostFilter::HorizontalDeblockFilter(Plane plane, int row4x4_start, - int column4x4_start, int unit_id) { - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int row_step = 1 << subsampling_y; - const int column_step = 1 << subsampling_x; - const size_t src_step = 4 * pixel_size_; - const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(plane)); - const ptrdiff_t src_stride = frame_buffer_.stride(plane); - uint8_t* src = GetSourceBuffer(plane, row4x4_start, column4x4_start); - const uint64_t single_row_mask = 0xffff; - // 3 (11), 5 (0101). - const uint64_t two_block_mask = (subsampling_x > 0) ? 5 : 3; - const LoopFilterType type = kLoopFilterTypeHorizontal; - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int column = subsampling_x; - - // AV1 smallest transform size is 4x4, thus minimum horizontal edge size is - // 4x4. For SIMD implementation, sse2 could compute 8 pixels at the same time. - // __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time. - // __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could - // filter 2 horizontal edges using sse2 and 4 edges using AVX2. - // The bitmask enables us to call different SIMD implementations to filter - // 1 edge, or 2 edges or 4 edges. - // TODO(chengchen): Here, the implementation only consider 1 and 2 edges. - // Add support for 4 edges. More branches involved, for example, if input is - // 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using - // sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then - // we apply filtering for 2 edges using sse2, and 4 edges using AVX2. 
- for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; - row4x4 += row_step) { - if (row4x4_start + row4x4 == 0) { - src += row_stride; - continue; - } - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int row = GetDeblockPosition(row4x4, subsampling_y); - const int index = GetIndex(row); - const int shift = GetShift(row, column); - const int level_offset = LoopFilterMask::GetLevelOffset(row, column); - // Mask of current row. mask4x4 represents the vertical filter length for - // the current horizontal edge is 4, and we needs to apply 3-tap filtering. - // Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16. - uint64_t mask4x4 = - (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId4x4, index) >> - shift) & - single_row_mask; - uint64_t mask8x8 = - (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId8x8, index) >> - shift) & - single_row_mask; - uint64_t mask16x16 = - (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId16x16, - index) >> - shift) & - single_row_mask; - // mask4x4, mask8x8, mask16x16 are mutually exclusive. - assert((mask4x4 & mask8x8) == 0 && (mask4x4 & mask16x16) == 0 && - (mask8x8 & mask16x16) == 0); - // Apply deblock filter for one row. - uint8_t* src_row = src; - int column_offset = 0; - for (uint64_t mask = mask4x4 | mask8x8 | mask16x16; mask != 0;) { - int edge_count = 1; - if ((mask & 1) != 0) { - // Filter parameters of current edge. - const uint8_t level = masks_->GetLevel(unit_id, plane, type, - level_offset + column_offset); - int outer_thresh_0; - int inner_thresh_0; - int hev_thresh_0; - GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0, - &hev_thresh_0); - // Filter parameters of next edge. Clip the index to avoid over - // reading at the edge of the block. The values will be unused in that - // case. - const int level_next_index = level_offset + column_offset + column_step; - const uint8_t level_next = - masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff); - int outer_thresh_1; - int inner_thresh_1; - int hev_thresh_1; - GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1, - &hev_thresh_1); - - if ((mask16x16 & 1) != 0) { - const dsp::LoopFilterSize size = (plane == kPlaneY) - ? dsp::kLoopFilterSize14 - : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask16x16 & two_block_mask) == two_block_mask) { - edge_count = 2; - // Apply filtering for two edges. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row + src_step, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else { - // Apply single edge filtering. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } - } - - if ((mask8x8 & 1) != 0) { - const dsp::LoopFilterSize size = - plane == kPlaneY ? dsp::kLoopFilterSize8 : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask8x8 & two_block_mask) == two_block_mask) { - edge_count = 2; - // Apply filtering for two edges. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row + src_step, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else { - // Apply single edge filtering. 
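[Editor's note] The new vertical edge-info functions above take a |bp_ptr| into the BlockParameters grid instead of looking blocks up by coordinates, so the block to the left is *(bp_ptr - 1) and the block one 4x4 row below is one grid stride away (as the CDEF code's bp_row1_base shows). A minimal sketch of that row-major addressing; the struct and function names here are illustrative, not the library's API:

    struct BlockParametersSketch {
      bool skip;
    };

    // |grid| holds rows4x4 * columns4x4 pointers in row-major order.
    inline const BlockParametersSketch* BlockAt(
        const BlockParametersSketch* const* grid, int columns4x4, int row4x4,
        int column4x4) {
      return grid[row4x4 * columns4x4 + column4x4];
    }
    // Left neighbor of the block at |ptr|: *(ptr - 1).
    // Block one 4x4 row below:           *(ptr + columns4x4).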
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } - } +bool PostFilter::GetVerticalDeblockFilterEdgeInfo( + const Plane /*plane*/, int row4x4, int column4x4, + const int8_t /*subsampling_x*/, const int8_t /*subsampling_y*/, + BlockParameters* const* bp_ptr, uint8_t* level, int* step, + int* filter_length) const { + const BlockParameters* bp = *bp_ptr; + *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]]; + if (column4x4 == 0) return false; - if ((mask4x4 & 1) != 0) { - const dsp::LoopFilterSize size = dsp::kLoopFilterSize4; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask4x4 & two_block_mask) == two_block_mask) { - edge_count = 2; - // Apply filtering for two edges. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row + src_step, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else { - // Apply single edge filtering. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } - } - } - - const int step = edge_count * column_step; - mask4x4 >>= step; - mask8x8 >>= step; - mask16x16 >>= step; - mask >>= step; - column_offset += step; - src_row += MultiplyBy4(edge_count) * pixel_size_; - } - src += row_stride; + const int filter_id = 0; + const uint8_t level_this = bp->deblock_filter_level[filter_id]; + const int column4x4_prev = column4x4 - 1; + assert(column4x4_prev >= 0); + const BlockParameters* bp_prev = *(bp_ptr - 1); + const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; + *level = level_this; + if (level_this == 0) { + if (level_prev == 0) return false; + *level = level_prev; } -} -void PostFilter::VerticalDeblockFilter(Plane plane, int row4x4_start, - int column4x4_start, int unit_id) { - const int8_t subsampling_x = subsampling_x_[plane]; - const int8_t subsampling_y = subsampling_y_[plane]; - const int row_step = 1 << subsampling_y; - const int two_row_step = row_step << 1; - const int column_step = 1 << subsampling_x; - const size_t src_step = (bitdepth_ == 8) ? 4 : 4 * sizeof(uint16_t); - const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(plane)); - const ptrdiff_t two_row_stride = row_stride << 1; - const ptrdiff_t src_stride = frame_buffer_.stride(plane); - uint8_t* src = GetSourceBuffer(plane, row4x4_start, column4x4_start); - const uint64_t single_row_mask = 0xffff; - const LoopFilterType type = kLoopFilterTypeVertical; - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int column = subsampling_x; - - // AV1 smallest transform size is 4x4, thus minimum vertical edge size is 4x4. - // For SIMD implementation, sse2 could compute 8 pixels at the same time. - // __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time. - // __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could - // filter 2 vertical edges using sse2 and 4 edges using AVX2. - // The bitmask enables us to call different SIMD implementations to filter - // 1 edge, or 2 edges or 4 edges. - // TODO(chengchen): Here, the implementation only consider 1 and 2 edges. - // Add support for 4 edges. More branches involved, for example, if input is - // 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using - // sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then - // we apply filtering for 2 edges using sse2, and 4 edges using AVX2. 
- for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; - row4x4 += two_row_step) { - // Subsampled UV samples correspond to the right/bottom position of - // Y samples. - const int row = GetDeblockPosition(row4x4, subsampling_y); - const int row_next = row + row_step; - const int index = GetIndex(row); - const int shift = GetShift(row, column); - const int level_offset = LoopFilterMask::GetLevelOffset(row, column); - const int index_next = GetIndex(row_next); - const int shift_next_row = GetShift(row_next, column); - const int level_offset_next_row = - LoopFilterMask::GetLevelOffset(row_next, column); - // TODO(chengchen): replace 0, 1, 2 to meaningful enum names. - // mask of current row. mask4x4 represents the horizontal filter length for - // the current vertical edge is 4, and we needs to apply 3-tap filtering. - // Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16. - uint64_t mask4x4_0 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4, - index) >> - shift) & - single_row_mask; - uint64_t mask8x8_0 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8, - index) >> - shift) & - single_row_mask; - uint64_t mask16x16_0 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16, - index) >> - shift) & - single_row_mask; - // mask4x4, mask8x8, mask16x16 are mutually exclusive. - assert((mask4x4_0 & mask8x8_0) == 0 && (mask4x4_0 & mask16x16_0) == 0 && - (mask8x8_0 & mask16x16_0) == 0); - // mask of the next row. With mask of current and the next row, we can call - // the corresponding SIMD function to apply filtering for two vertical - // edges together. - uint64_t mask4x4_1 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4, - index_next) >> - shift_next_row) & - single_row_mask; - uint64_t mask8x8_1 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8, - index_next) >> - shift_next_row) & - single_row_mask; - uint64_t mask16x16_1 = - (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16, - index_next) >> - shift_next_row) & - single_row_mask; - // mask4x4, mask8x8, mask16x16 are mutually exclusive. - assert((mask4x4_1 & mask8x8_1) == 0 && (mask4x4_1 & mask16x16_1) == 0 && - (mask8x8_1 & mask16x16_1) == 0); - // Apply deblock filter for two rows. - uint8_t* src_row = src; - int column_offset = 0; - for (uint64_t mask = mask4x4_0 | mask8x8_0 | mask16x16_0 | mask4x4_1 | - mask8x8_1 | mask16x16_1; - mask != 0;) { - if ((mask & 1) != 0) { - // Filter parameters of current row. - const uint8_t level = masks_->GetLevel(unit_id, plane, type, - level_offset + column_offset); - int outer_thresh_0; - int inner_thresh_0; - int hev_thresh_0; - GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0, - &hev_thresh_0); - // Filter parameters of next row. Clip the index to avoid over - // reading at the edge of the block. The values will be unused in that - // case. - const int level_next_index = level_offset_next_row + column_offset; - const uint8_t level_next = - masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff); - int outer_thresh_1; - int inner_thresh_1; - int hev_thresh_1; - GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1, - &hev_thresh_1); - uint8_t* const src_row_next = src_row + row_stride; - - if (((mask16x16_0 | mask16x16_1) & 1) != 0) { - const dsp::LoopFilterSize size = (plane == kPlaneY) - ? 
dsp::kLoopFilterSize14 - : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask16x16_0 & mask16x16_1 & 1) != 0) { - // Apply dual vertical edge filtering. - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else if ((mask16x16_0 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } else { - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } - } - - if (((mask8x8_0 | mask8x8_1) & 1) != 0) { - const dsp::LoopFilterSize size = (plane == kPlaneY) - ? dsp::kLoopFilterSize8 - : dsp::kLoopFilterSize6; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask8x8_0 & mask8x8_1 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else if ((mask8x8_0 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } else { - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } - } + const int prediction_masks = kBlockWidthPixels[bp->size] - 1; + const int pixel_position = MultiplyBy4(column4x4); + const bool is_border = (pixel_position & prediction_masks) == 0; + const bool skip = bp->skip && bp->is_inter; + const bool skip_prev = bp_prev->skip && bp_prev->is_inter; + if (skip && skip_prev && !is_border) return false; + const int step_prev = + kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]]; + *filter_length = std::min(*step, step_prev); + return true; +} - if (((mask4x4_0 | mask4x4_1) & 1) != 0) { - const dsp::LoopFilterSize size = dsp::kLoopFilterSize4; - const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - if ((mask4x4_0 & mask4x4_1 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } else if ((mask4x4_0 & 1) != 0) { - filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0, - hev_thresh_0); - } else { - filter_func(src_row_next, src_stride, outer_thresh_1, - inner_thresh_1, hev_thresh_1); - } - } - } +bool PostFilter::GetVerticalDeblockFilterEdgeInfoUV( + const Plane plane, int row4x4, int column4x4, const int8_t subsampling_x, + const int8_t subsampling_y, BlockParameters* const* bp_ptr, uint8_t* level, + int* step, int* filter_length) const { + row4x4 = GetDeblockPosition(row4x4, subsampling_y); + column4x4 = GetDeblockPosition(column4x4, subsampling_x); + const BlockParameters* bp = *bp_ptr; + *step = kTransformWidth[bp->uv_transform_size]; + if (column4x4 == subsampling_x) return false; - mask4x4_0 >>= column_step; - mask8x8_0 >>= column_step; - mask16x16_0 >>= column_step; - mask4x4_1 >>= column_step; - mask8x8_1 >>= column_step; - mask16x16_1 >>= column_step; - mask >>= column_step; - column_offset += column_step; - src_row += src_step; - } - src += two_row_stride; + const int filter_id = + kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical]; + const uint8_t level_this = bp->deblock_filter_level[filter_id]; + const BlockParameters* bp_prev = *(bp_ptr - (1 << subsampling_x)); + const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; + *level = level_this; + if (level_this == 0) { + if (level_prev == 0) 
return false; + *level = level_prev; } + + const BlockSize size = + kPlaneResidualSize[bp->size][subsampling_x][subsampling_y]; + const int prediction_masks = kBlockWidthPixels[size] - 1; + const int pixel_position = MultiplyBy4(column4x4 >> subsampling_x); + const bool is_border = (pixel_position & prediction_masks) == 0; + const bool skip = bp->skip && bp->is_inter; + const bool skip_prev = bp_prev->skip && bp_prev->is_inter; + if (skip && skip_prev && !is_border) return false; + const int step_prev = kTransformWidth[bp_prev->uv_transform_size]; + *filter_length = std::min(*step, step_prev); + return true; } -void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, - int unit_id) { - static_cast<void>(unit_id); +void PostFilter::HorizontalDeblockFilter(Plane plane, int row4x4_start, + int column4x4_start) { const int8_t subsampling_x = subsampling_x_[plane]; const int8_t subsampling_y = subsampling_y_[plane]; const int column_step = 1 << subsampling_x; @@ -486,27 +240,22 @@ void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, int filter_length; for (int column4x4 = 0; MultiplyBy4(column4x4_start + column4x4) < width_ && - column4x4 < kNum4x4InLoopFilterMaskUnit; + column4x4 < kNum4x4InLoopFilterUnit; column4x4 += column_step, src += src_step) { uint8_t* src_row = src; for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; + row4x4 < kNum4x4InLoopFilterUnit; row4x4 += row_step) { - const bool need_filter = - GetDeblockFilterEdgeInfo<kLoopFilterTypeHorizontal>( - plane, row4x4_start + row4x4, column4x4_start + column4x4, - subsampling_x, subsampling_y, &level, &row_step, &filter_length); + const bool need_filter = GetHorizontalDeblockFilterEdgeInfo( + plane, row4x4_start + row4x4, column4x4_start + column4x4, + subsampling_x, subsampling_y, &level, &row_step, &filter_length); if (need_filter) { - int outer_thresh; - int inner_thresh; - int hev_thresh; - GetDeblockFilterParams(level, &outer_thresh, &inner_thresh, - &hev_thresh); const dsp::LoopFilterSize size = - GetLoopFilterSize(plane, filter_length); + (plane == kPlaneY) ? GetLoopFilterSizeY(filter_length) + : GetLoopFilterSizeUV(filter_length); const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - filter_func(src_row, src_stride, outer_thresh, inner_thresh, - hev_thresh); + filter_func(src_row, src_stride, outer_thresh_[level], + inner_thresh_[level], HevThresh(level)); } // TODO(chengchen): use shifts instead of multiplication. 
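
The edge-info helpers above all apply the same filter-level fallback: use the current block's level, borrow the neighboring block's level when the current one is zero, and report no filtering when both are zero (the callers additionally check the skip flags and block borders). A condensed sketch of just that rule, not the library's API:

#include <cstdint>

// Returns false when neither side of the edge requests filtering; otherwise
// writes the level to use into |*level|.
bool SelectDeblockLevel(uint8_t level_this, uint8_t level_prev, uint8_t* level) {
  *level = level_this;
  if (level_this == 0) {
    if (level_prev == 0) return false;
    *level = level_prev;  // Borrow the neighbor's level.
  }
  return true;
}
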
src_row += row_step * src_stride; @@ -515,9 +264,8 @@ void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start, } } -void PostFilter::VerticalDeblockFilterNoMask(Plane plane, int row4x4_start, - int column4x4_start, int unit_id) { - static_cast<void>(unit_id); +void PostFilter::VerticalDeblockFilter(Plane plane, int row4x4_start, + int column4x4_start) { const int8_t subsampling_x = subsampling_x_[plane]; const int8_t subsampling_y = subsampling_y_[plane]; const int row_step = 1 << subsampling_y; @@ -529,29 +277,30 @@ void PostFilter::VerticalDeblockFilterNoMask(Plane plane, int row4x4_start, uint8_t level; int filter_length; + BlockParameters* const* bp_row_base = block_parameters_.Address( + GetDeblockPosition(row4x4_start, subsampling_y), + GetDeblockPosition(column4x4_start, subsampling_x)); + const auto edge_info = deblock_vertical_edge_info_[plane]; + const int bp_stride = block_parameters_.columns4x4() * row_step; for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ && - row4x4 < kNum4x4InLoopFilterMaskUnit; - row4x4 += row_step, src += row_stride) { + row4x4 < kNum4x4InLoopFilterUnit; + row4x4 += row_step, src += row_stride, bp_row_base += bp_stride) { uint8_t* src_row = src; + BlockParameters* const* bp = bp_row_base; for (int column4x4 = 0; MultiplyBy4(column4x4_start + column4x4) < width_ && - column4x4 < kNum4x4InLoopFilterMaskUnit; - column4x4 += column_step) { - const bool need_filter = - GetDeblockFilterEdgeInfo<kLoopFilterTypeVertical>( - plane, row4x4_start + row4x4, column4x4_start + column4x4, - subsampling_x, subsampling_y, &level, &column_step, - &filter_length); + column4x4 < kNum4x4InLoopFilterUnit; + column4x4 += column_step, bp += column_step) { + const bool need_filter = (this->*edge_info)( + plane, row4x4_start + row4x4, column4x4_start + column4x4, + subsampling_x, subsampling_y, bp, &level, &column_step, + &filter_length); if (need_filter) { - int outer_thresh; - int inner_thresh; - int hev_thresh; - GetDeblockFilterParams(level, &outer_thresh, &inner_thresh, - &hev_thresh); const dsp::LoopFilterSize size = - GetLoopFilterSize(plane, filter_length); + (plane == kPlaneY) ? GetLoopFilterSizeY(filter_length) + : GetLoopFilterSizeUV(filter_length); const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type]; - filter_func(src_row, src_stride, outer_thresh, inner_thresh, - hev_thresh); + filter_func(src_row, src_stride, outer_thresh_[level], + inner_thresh_[level], HevThresh(level)); } src_row += column_step * pixel_size_; column_step = DivideBy4(column_step << subsampling_x); @@ -573,21 +322,19 @@ void PostFilter::ApplyDeblockFilterForOneSuperBlockRow(int row4x4_start, if (row4x4 >= frame_header_.rows4x4) break; int column4x4; for (column4x4 = 0; column4x4 < frame_header_.columns4x4; - column4x4 += kNum4x4InLoopFilterMaskUnit) { + column4x4 += kNum4x4InLoopFilterUnit) { // First apply vertical filtering - VerticalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4, - column4x4, 0); + VerticalDeblockFilter(static_cast<Plane>(plane), row4x4, column4x4); // Delay one superblock to apply horizontal filtering. if (column4x4 != 0) { - HorizontalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4, - column4x4 - kNum4x4InLoopFilterMaskUnit, - 0); + HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4, + column4x4 - kNum4x4InLoopFilterUnit); } } // Horizontal filtering for the last 64x64 block. 
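
The loop above runs the vertical pass on each 64x64 unit as soon as it is reached but delays the horizontal pass by one unit, so the horizontal filter always reads pixels whose vertical filtering is complete; the call that follows flushes the final unit. A standalone sketch of that ordering only (print statements stand in for the real filters, and it assumes at least one unit per row):

#include <cstdio>

constexpr int kUnit4x4 = 16;  // 64-pixel loop filter unit = 16 4x4 blocks.

void VerticalPass(int row4x4, int column4x4) {
  std::printf("vertical   %d,%d\n", row4x4, column4x4);
}
void HorizontalPass(int row4x4, int column4x4) {
  std::printf("horizontal %d,%d\n", row4x4, column4x4);
}

// Mirrors the delay-by-one-unit ordering used above.
void FilterOneSuperBlockRow(int row4x4, int columns4x4) {
  int column4x4 = 0;
  for (; column4x4 < columns4x4; column4x4 += kUnit4x4) {
    VerticalPass(row4x4, column4x4);
    if (column4x4 != 0) HorizontalPass(row4x4, column4x4 - kUnit4x4);
  }
  HorizontalPass(row4x4, column4x4 - kUnit4x4);  // Flush the last unit.
}
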
- HorizontalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4, - column4x4 - kNum4x4InLoopFilterMaskUnit, 0); + HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4, + column4x4 - kNum4x4InLoopFilterUnit); } } } @@ -602,12 +349,11 @@ void PostFilter::DeblockFilterWorker(int jobs_per_plane, const Plane* planes, total_jobs) { const Plane plane = planes[job_index / jobs_per_plane]; const int row_unit = job_index % jobs_per_plane; - const int row4x4 = row_unit * kNum4x4InLoopFilterMaskUnit; + const int row4x4 = row_unit * kNum4x4InLoopFilterUnit; for (int column4x4 = 0, column_unit = 0; column4x4 < frame_header_.columns4x4; - column4x4 += kNum4x4InLoopFilterMaskUnit, ++column_unit) { - const int unit_id = GetDeblockUnitId(row_unit, column_unit); - (this->*deblock_filter)(plane, row4x4, column4x4, unit_id); + column4x4 += kNum4x4InLoopFilterUnit, ++column_unit) { + (this->*deblock_filter)(plane, row4x4, column4x4); } } } @@ -635,8 +381,7 @@ void PostFilter::ApplyDeblockFilterThreaded() { // The only synchronization involved is to know when the each directional // filter is complete for the entire frame. for (auto& type : {kLoopFilterTypeVertical, kLoopFilterTypeHorizontal}) { - const DeblockFilter deblock_filter = - deblock_filter_type_table_[kDeblockFilterBitMask][type]; + const DeblockFilter deblock_filter = deblock_filter_func_[type]; std::atomic<int> job_counter(0); BlockingCounter pending_workers(num_workers); for (int i = 0; i < num_workers; ++i) { @@ -656,4 +401,31 @@ void PostFilter::ApplyDeblockFilterThreaded() { } } +void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type, + int row4x4_start, int column4x4_start, + int column4x4_end, int sb4x4) { + assert(row4x4_start >= 0); + assert(DoDeblock()); + + column4x4_end = std::min(column4x4_end, frame_header_.columns4x4); + if (column4x4_start >= column4x4_end) return; + + const DeblockFilter deblock_filter = deblock_filter_func_[loop_filter_type]; + const int sb_height4x4 = + std::min(sb4x4, frame_header_.rows4x4 - row4x4_start); + for (int plane = kPlaneY; plane < planes_; ++plane) { + if (plane != kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) { + continue; + } + + for (int y = 0; y < sb_height4x4; y += kNum4x4InLoopFilterUnit) { + const int row4x4 = row4x4_start + y; + for (int column4x4 = column4x4_start; column4x4 < column4x4_end; + column4x4 += kNum4x4InLoopFilterUnit) { + (this->*deblock_filter)(static_cast<Plane>(plane), row4x4, column4x4); + } + } + } +} + } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc b/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc new file mode 100644 index 00000000000..ca12aaaeb7e --- /dev/null +++ b/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc @@ -0,0 +1,85 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Thresholds for the deblocking filter. 
Precomputed values of part of Section +// 7.14.4 for all possible values of sharpness. + +constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = { + {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; + +constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = { + {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, + 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, + 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157, + 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, + 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, + 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, + 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, + 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, + 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, + 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33, + 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, + 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, + 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, + 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31, + 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, + 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, + 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, + 112, 114, 116, 118, 
120, 122, 124, 126, 128, 130, 132, 134}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, + 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, + 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, + 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, + 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}}; diff --git a/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc b/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc index a36788057ba..b36ad80cf05 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc @@ -21,16 +21,14 @@ void PostFilter::ApplyLoopRestorationForOneUnit( uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride, const Plane plane, const int plane_height, const int x, const int y, const int row, const int column, const int unit_row, - const int current_process_unit_height, const int plane_process_unit_width, - const int plane_unit_size, const int num_horizontal_units, - const int plane_width, Array2DView<Pixel>* const loop_restored_window) { + const int current_process_unit_height, const int plane_unit_size, + const int num_horizontal_units, const int plane_width, + Array2DView<Pixel>* const loop_restored_window) { const int unit_x = x + column; const int unit_y = y + row; const int current_process_unit_width = - (unit_x + plane_process_unit_width <= plane_width) - ? plane_process_unit_width - : plane_width - unit_x; - uint8_t* cdef_unit_buffer = + std::min(plane_unit_size, plane_width - unit_x); + const uint8_t* cdef_unit_buffer = cdef_buffer + unit_y * cdef_buffer_stride + unit_x * pixel_size_; const int unit_column = std::min(unit_x / plane_unit_size, num_horizontal_units - 1); @@ -49,54 +47,47 @@ void PostFilter::ApplyLoopRestorationForOneUnit( return; } + const ptrdiff_t block_buffer_stride = + kRestorationUnitWidthWithBorders * sizeof(Pixel); // The SIMD implementation of wiener filter (currently WienerFilter_SSE4_1()) // over-reads 6 bytes, so add 6 extra bytes at the end of block_buffer for 8 // bit. - alignas(alignof(uint16_t)) - uint8_t block_buffer[kRestorationProcessingUnitSizeWithBorders * - kRestorationProcessingUnitSizeWithBorders * - sizeof(Pixel) + - ((sizeof(Pixel) == 1) ? 6 : 0)]; - const ptrdiff_t block_buffer_stride = - kRestorationProcessingUnitSizeWithBorders * pixel_size_; - IntermediateBuffers intermediate_buffers; - - RestorationBuffer restoration_buffer = { - {intermediate_buffers.box_filter.output[0], - intermediate_buffers.box_filter.output[1]}, - plane_process_unit_width, - {intermediate_buffers.box_filter.intermediate_a, - intermediate_buffers.box_filter.intermediate_b}, - kRestorationProcessingUnitSizeWithBorders + kRestorationPadding, - intermediate_buffers.wiener, - kMaxSuperBlockSizeInPixels}; - const int deblock_buffer_units = 64 >> subsampling_y_[plane]; - uint8_t* const deblock_buffer = deblock_buffer_.data(plane); - const int deblock_buffer_stride = deblock_buffer_.stride(plane); - const int deblock_unit_y = - std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0); - uint8_t* deblock_unit_buffer = - (deblock_buffer != nullptr) - ? 
deblock_buffer + deblock_unit_y * deblock_buffer_stride + - unit_x * pixel_size_ - : nullptr; + alignas(alignof(uint16_t)) uint8_t + block_buffer[kRestorationUnitHeightWithBorders * block_buffer_stride + + ((sizeof(Pixel) == 1) ? 6 : 0)]; + RestorationBuffer restoration_buffer; + const uint8_t* source; + ptrdiff_t source_stride; + if (DoCdef()) { + const int deblock_buffer_units = 64 >> subsampling_y_[plane]; + const uint8_t* const deblock_buffer = deblock_buffer_.data(plane); + assert(deblock_buffer != nullptr); + const int deblock_buffer_stride = deblock_buffer_.stride(plane); + const int deblock_unit_y = + std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0); + const uint8_t* const deblock_unit_buffer = + deblock_buffer + deblock_unit_y * deblock_buffer_stride + + unit_x * pixel_size_; + PrepareLoopRestorationBlock<Pixel>( + cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer, + deblock_buffer_stride, block_buffer, block_buffer_stride, + current_process_unit_width, current_process_unit_height, unit_y == 0, + unit_y + current_process_unit_height >= plane_height); + source = block_buffer + kRestorationBorder * block_buffer_stride + + kRestorationBorder * pixel_size_; + source_stride = kRestorationUnitWidthWithBorders; + } else { + source = cdef_unit_buffer; + source_stride = cdef_buffer_stride / sizeof(Pixel); + } assert(type == kLoopRestorationTypeSgrProj || type == kLoopRestorationTypeWiener); const dsp::LoopRestorationFunc restoration_func = dsp_.loop_restorations[type - 2]; - PrepareLoopRestorationBlock<Pixel>( - DoCdef(), cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer, - deblock_buffer_stride, block_buffer, block_buffer_stride, - current_process_unit_width, current_process_unit_height, unit_y == 0, - unit_y + current_process_unit_height >= plane_height); - restoration_func(reinterpret_cast<const uint8_t*>( - block_buffer + kRestorationBorder * block_buffer_stride + - kRestorationBorder * pixel_size_), - &(*loop_restored_window)[row][column], + restoration_func(source, &(*loop_restored_window)[row][column], restoration_info_->loop_restoration_info( static_cast<Plane>(plane), unit_id), - block_buffer_stride, - loop_restored_window->columns() * pixel_size_, + source_stride, loop_restored_window->columns(), current_process_unit_width, current_process_unit_height, &restoration_buffer); } @@ -104,9 +95,8 @@ void PostFilter::ApplyLoopRestorationForOneUnit( template <typename Pixel> void PostFilter::ApplyLoopRestorationForSuperBlock( const Plane plane, const int x, const int y, const int unit_row, - const int current_process_unit_height, const int process_unit_width) { + const int current_process_unit_height, const int plane_unit_size) { const int stride = frame_buffer_.stride(plane); - const int plane_unit_size = loop_restoration_.unit_size[plane]; const int num_horizontal_units = restoration_info_->num_horizontal_units(static_cast<Plane>(plane)); const int plane_width = @@ -119,23 +109,14 @@ void PostFilter::ApplyLoopRestorationForSuperBlock( x * pixel_size_)); ApplyLoopRestorationForOneUnit<Pixel>( superres_buffer_[plane], stride, plane, plane_height, x, y, 0, 0, - unit_row, current_process_unit_height, process_unit_width, - plane_unit_size, num_horizontal_units, plane_width, - &loop_restored_window); + unit_row, current_process_unit_height, plane_unit_size, + num_horizontal_units, plane_width, &loop_restored_window); } void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4) { assert(row4x4_start >= 0); assert(DoRestoration()); - 
const int plane_process_unit_width[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneV]}; - const int plane_process_unit_height[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneV]}; for (int plane = 0; plane < planes_; ++plane) { if (frame_header_.loop_restoration.type[plane] == kLoopRestorationTypeNone) { @@ -149,36 +130,36 @@ void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, subsampling_x_[plane]); const int num_vertical_units = restoration_info_->num_vertical_units(static_cast<Plane>(plane)); - const int process_unit_width = plane_process_unit_width[plane]; + const int plane_unit_size = frame_header_.loop_restoration.unit_size[plane]; + const int plane_process_unit_height = + kRestorationUnitHeight >> subsampling_y_[plane]; + int y = (row4x4_start == 0) + ? 0 + : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) - + unit_height_offset; + int expected_height = plane_process_unit_height - + ((row4x4_start == 0) ? unit_height_offset : 0); for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) { - const int row4x4 = row4x4_start + sb_y; - const int y = (MultiplyBy4(row4x4) - (row4x4 == 0 ? 0 : 8)) >> - subsampling_y_[plane]; if (y >= plane_height) break; - const int plane_unit_size = - frame_header_.loop_restoration.unit_size[plane]; const int unit_row = std::min((y + unit_height_offset) / plane_unit_size, num_vertical_units - 1); - const int expected_height = plane_process_unit_height[plane] + - ((y == 0) ? -unit_height_offset : 0); const int current_process_unit_height = - (y + expected_height <= plane_height) ? 
expected_height - : plane_height - y; - for (int column4x4 = 0;; column4x4 += 16) { - const int x = MultiplyBy4(column4x4) >> subsampling_x_[plane]; - if (x >= plane_width) break; + std::min(expected_height, plane_height - y); + for (int x = 0; x < plane_width; x += plane_unit_size) { #if LIBGAV1_MAX_BITDEPTH >= 10 if (bitdepth_ >= 10) { ApplyLoopRestorationForSuperBlock<uint16_t>( static_cast<Plane>(plane), x, y, unit_row, - current_process_unit_height, process_unit_width); + current_process_unit_height, plane_unit_size); continue; } #endif ApplyLoopRestorationForSuperBlock<uint8_t>( static_cast<Plane>(plane), x, y, unit_row, - current_process_unit_height, process_unit_width); + current_process_unit_height, plane_unit_size); } + expected_height = plane_process_unit_height; + y += current_process_unit_height; } } } @@ -188,18 +169,16 @@ void PostFilter::ApplyLoopRestorationForOneRowInWindow( uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride, const Plane plane, const int plane_height, const int plane_width, const int x, const int y, const int row, const int unit_row, - const int current_process_unit_height, const int process_unit_width, - const int window_width, const int plane_unit_size, - const int num_horizontal_units) { + const int current_process_unit_height, const int plane_unit_size, + const int window_width, const int num_horizontal_units) { Array2DView<Pixel> loop_restored_window( window_buffer_height_, window_buffer_width_, reinterpret_cast<Pixel*>(threaded_window_buffer_)); - for (int column = 0; column < window_width; column += process_unit_width) { + for (int column = 0; column < window_width; column += plane_unit_size) { ApplyLoopRestorationForOneUnit<Pixel>( cdef_buffer, cdef_buffer_stride, plane, plane_height, x, y, row, column, - unit_row, current_process_unit_height, process_unit_width, - plane_unit_size, num_horizontal_units, plane_width, - &loop_restored_window); + unit_row, current_process_unit_height, plane_unit_size, + num_horizontal_units, plane_width, &loop_restored_window); } } @@ -210,20 +189,14 @@ void PostFilter::ApplyLoopRestorationForOneRowInWindow( // completes filtering until all jobs are finished. This approach requires an // extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose // size is the size of the window. It also needs block buffers (i.e., -// |block_buffer| and |intermediate_buffers| in -// ApplyLoopRestorationForOneUnit()) to store intermediate results in loop -// restoration for each thread. After all units inside the window are filtered, -// the output is written to the frame buffer. +// |block_buffer| in ApplyLoopRestorationForOneUnit()) to store intermediate +// results in loop restoration for each thread. After all units inside the +// window are filtered, the output is written to the frame buffer. 
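
The comment above summarizes the window-based threading: most unit rows in the current window are handed to the worker pool, the calling thread filters the remaining rows itself, and a blocking counter delays the copy from |threaded_window_buffer_| to the frame until every job has finished. A simplified sketch of that scheduling shape, using generic stand-ins rather than the libgav1 ThreadPool and BlockingCounter types:

#include <condition_variable>
#include <functional>
#include <mutex>

// Minimal counting latch, standing in for BlockingCounter.
class SimpleLatch {
 public:
  explicit SimpleLatch(int count) : count_(count) {}
  void Decrement() {
    std::lock_guard<std::mutex> lock(mutex_);
    if (--count_ == 0) done_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    done_.wait(lock, [this] { return count_ == 0; });
  }

 private:
  std::mutex mutex_;
  std::condition_variable done_;
  int count_;
};

void RestoreOneRow(int row) { /* filter one row of units into the window. */ }

// |schedule| posts a job to a worker pool (stand-in for ThreadPool::Schedule).
void RestoreWindow(int num_rows, int num_workers,
                   const std::function<void(std::function<void()>)>& schedule) {
  // As in the code below: keep a share of the rows on the calling thread.
  const int pooled_rows = num_rows * num_workers / (num_workers + 1);
  SimpleLatch pending(pooled_rows);
  for (int row = 0; row < num_rows; ++row) {
    if (row < pooled_rows) {
      schedule([row, &pending] { RestoreOneRow(row); pending.Decrement(); });
    } else {
      RestoreOneRow(row);
    }
  }
  pending.Wait();  // Window buffer is complete; safe to copy to the frame.
}
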
template <typename Pixel> void PostFilter::ApplyLoopRestorationThreaded() { - const int plane_process_unit_width[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_x_[kPlaneV]}; const int plane_process_unit_height[kMaxPlanes] = { - kRestorationProcessingUnitSize, - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneU], - kRestorationProcessingUnitSize >> subsampling_y_[kPlaneV]}; + kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU], + kRestorationUnitHeight >> subsampling_y_[kPlaneV]}; for (int plane = kPlaneY; plane < planes_; ++plane) { if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { @@ -270,11 +243,11 @@ void PostFilter::ApplyLoopRestorationThreaded() { plane_process_unit_height[plane] + 1; } + const int jobs_for_threadpool = + vertical_units_per_window * num_workers / (num_workers + 1); for (int x = 0; x < plane_width; x += window_buffer_width_) { const int actual_window_width = std::min(window_buffer_width_, plane_width - x); - const int jobs_for_threadpool = - vertical_units_per_window * num_workers / (num_workers + 1); assert(jobs_for_threadpool < vertical_units_per_window); BlockingCounter pending_jobs(jobs_for_threadpool); int job_count = 0; @@ -282,37 +255,32 @@ void PostFilter::ApplyLoopRestorationThreaded() { for (int row = 0; row < actual_window_height; row += current_process_unit_height) { const int unit_y = y + row; - const int expected_height = plane_process_unit_height[plane] + - ((unit_y == 0) ? -unit_height_offset : 0); + const int expected_height = plane_process_unit_height[plane] - + ((unit_y == 0) ? unit_height_offset : 0); current_process_unit_height = - (unit_y + expected_height <= plane_height) - ? 
expected_height - : plane_height - unit_y; + std::min(expected_height, plane_height - unit_y); const int unit_row = std::min((unit_y + unit_height_offset) / plane_unit_size, num_vertical_units - 1); - const int process_unit_width = plane_process_unit_width[plane]; if (job_count < jobs_for_threadpool) { thread_pool_->Schedule( - [this, src_buffer, src_stride, process_unit_width, + [this, src_buffer, src_stride, plane_unit_size, current_process_unit_height, actual_window_width, - plane_unit_size, num_horizontal_units, x, y, row, unit_row, - plane_height, plane_width, plane, &pending_jobs]() { + num_horizontal_units, x, y, row, unit_row, plane_height, + plane_width, plane, &pending_jobs]() { ApplyLoopRestorationForOneRowInWindow<Pixel>( src_buffer, src_stride, static_cast<Plane>(plane), plane_height, plane_width, x, y, row, unit_row, - current_process_unit_height, process_unit_width, - actual_window_width, plane_unit_size, - num_horizontal_units); + current_process_unit_height, plane_unit_size, + actual_window_width, num_horizontal_units); pending_jobs.Decrement(); }); } else { ApplyLoopRestorationForOneRowInWindow<Pixel>( src_buffer, src_stride, static_cast<Plane>(plane), plane_height, plane_width, x, y, row, unit_row, current_process_unit_height, - process_unit_width, actual_window_width, plane_unit_size, - num_horizontal_units); + plane_unit_size, actual_window_width, num_horizontal_units); } ++job_count; } diff --git a/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc b/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc index 1b65e9fbcf8..6174aabdee6 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc @@ -31,6 +31,9 @@ namespace libgav1 { namespace { +// Import all the constants in the anonymous namespace. +#include "src/post_filter/deblock_thresholds.inc" + // Row indices of deblocked pixels needed by loop restoration. This is used to // populate the |deblock_buffer_| when cdef is on. The first dimension is // subsampling_y. @@ -122,16 +125,11 @@ void ExtendFrame(uint8_t* const frame_start, const int width, const int height, } // namespace -PostFilter::PostFilter( - const ObuFrameHeader& frame_header, - const ObuSequenceHeader& sequence_header, LoopFilterMask* const masks, - const Array2D<int16_t>& cdef_index, - const Array2D<TransformSize>& inter_transform_sizes, - LoopRestorationInfo* const restoration_info, - BlockParametersHolder* block_parameters, YuvBuffer* const frame_buffer, - YuvBuffer* const deblock_buffer, const dsp::Dsp* dsp, - ThreadPool* const thread_pool, uint8_t* const threaded_window_buffer, - uint8_t* const superres_line_buffer, int do_post_filter_mask) +PostFilter::PostFilter(const ObuFrameHeader& frame_header, + const ObuSequenceHeader& sequence_header, + FrameScratchBuffer* const frame_scratch_buffer, + YuvBuffer* const frame_buffer, const dsp::Dsp* dsp, + int do_post_filter_mask) : frame_header_(frame_header), loop_restoration_(frame_header.loop_restoration), dsp_(*dsp), @@ -149,24 +147,24 @@ PostFilter::PostFilter( : kMaxPlanes), pixel_size_(static_cast<int>((bitdepth_ == 8) ? 
sizeof(uint8_t) : sizeof(uint16_t))), - masks_(masks), - cdef_index_(cdef_index), - inter_transform_sizes_(inter_transform_sizes), - threaded_window_buffer_(threaded_window_buffer), - restoration_info_(restoration_info), - window_buffer_width_(GetWindowBufferWidth(thread_pool, frame_header)), - window_buffer_height_(GetWindowBufferHeight(thread_pool, frame_header)), - superres_line_buffer_(superres_line_buffer), - block_parameters_(*block_parameters), + inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]), + outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]), + cdef_index_(frame_scratch_buffer->cdef_index), + inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes), + threaded_window_buffer_( + frame_scratch_buffer->threaded_window_buffer.get()), + restoration_info_(&frame_scratch_buffer->loop_restoration_info), + superres_line_buffer_(frame_scratch_buffer->superres_line_buffer.get()), + block_parameters_(frame_scratch_buffer->block_parameters_holder), frame_buffer_(*frame_buffer), - deblock_buffer_(*deblock_buffer), + deblock_buffer_(frame_scratch_buffer->deblock_buffer), do_post_filter_mask_(do_post_filter_mask), - thread_pool_(thread_pool) { + thread_pool_( + frame_scratch_buffer->threading_strategy.post_filter_thread_pool()), + window_buffer_width_(GetWindowBufferWidth(thread_pool_, frame_header)), + window_buffer_height_(GetWindowBufferHeight(thread_pool_, frame_header)) { const int8_t zero_delta_lf[kFrameLfCount] = {}; ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_); - if (DoDeblock()) { - InitDeblockFilterParams(); - } if (DoSuperRes()) { for (int plane = 0; plane < planes_; ++plane) { const int downscaled_width = @@ -196,7 +194,7 @@ PostFilter::PostFilter( // In single threaded mode, we apply SuperRes without making a copy of the // input row by writing the output to one row to the top (we refer to this // process as "in place superres" in our code). 
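
The comment above describes "in place superres": in single-threaded mode each upscaled row is written one row above its source row, so the wider output never overwrites source pixels that are still needed. A schematic sketch of that idea, with a hypothetical nearest-neighbor upscaler standing in for the real SuperRes kernel; it assumes the caller provides one writable border row above |plane| and that |stride| is at least |out_width|:

#include <cstddef>
#include <cstdint>

// Hypothetical horizontal upscaler (nearest neighbor), for illustration only.
void UpscaleRow(const uint8_t* src, int in_width, uint8_t* dst, int out_width) {
  for (int x = 0; x < out_width; ++x) {
    dst[x] = src[x * in_width / out_width];
  }
}

// Writes the output of row y into row y - 1. Row y's source pixels are only
// overwritten by the y + 1 iteration, after they have already been read, so
// no per-row copy into a line buffer is needed.
void InPlaceSuperRes(uint8_t* plane, ptrdiff_t stride, int in_width,
                     int out_width, int height) {
  uint8_t* row = plane;
  for (int y = 0; y < height; ++y, row += stride) {
    UpscaleRow(row, in_width, row - stride, out_width);
  }
}
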
- const bool in_place_superres = DoSuperRes() && thread_pool == nullptr; + const bool in_place_superres = DoSuperRes() && thread_pool_ == nullptr; if (DoCdef() || DoRestoration() || in_place_superres) { for (int plane = 0; plane < planes_; ++plane) { int horizontal_shift = 0; @@ -372,8 +370,8 @@ void PostFilter::ApplyFilteringThreaded() { if (DoDeblock()) ApplyDeblockFilterThreaded(); if (DoCdef() && DoRestoration()) { for (int row4x4 = 0; row4x4 < frame_header_.rows4x4; - row4x4 += kNum4x4InLoopFilterMaskUnit) { - SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterMaskUnit); + row4x4 += kNum4x4InLoopFilterUnit) { + SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterUnit); } } if (DoCdef()) ApplyCdef(); @@ -383,9 +381,10 @@ void PostFilter::ApplyFilteringThreaded() { } int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, - bool is_last_row) { + bool is_last_row, + bool do_deblock) { if (row4x4 < 0) return -1; - if (DoDeblock()) { + if (DoDeblock() && do_deblock) { ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4); } if (DoRestoration() && DoCdef()) { diff --git a/chromium/third_party/libgav1/src/src/post_filter/super_res.cc b/chromium/third_party/libgav1/src/src/post_filter/super_res.cc index 2dc1dcd61cf..8f17a37b5cb 100644 --- a/chromium/third_party/libgav1/src/src/post_filter/super_res.cc +++ b/chromium/third_party/libgav1/src/src/post_filter/super_res.cc @@ -35,10 +35,10 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& buffers, const std::array<int, kMaxPlanes>& strides, const std::array<int, kMaxPlanes>& rows, size_t line_buffer_offset) { - uint8_t* const line_buffer_start = - in_place ? nullptr - : superres_line_buffer_ + line_buffer_offset + - kSuperResHorizontalBorder * pixel_size_; + // Only used when |in_place| == false. + uint8_t* const line_buffer_start = superres_line_buffer_ + + line_buffer_offset + + kSuperResHorizontalBorder * pixel_size_; for (int plane = kPlaneY; plane < planes_; ++plane) { const int8_t subsampling_x = subsampling_x_[plane]; const int plane_width = diff --git a/chromium/third_party/libgav1/src/src/threading_strategy.cc b/chromium/third_party/libgav1/src/src/threading_strategy.cc index 75e2ed60270..5c0b940c835 100644 --- a/chromium/third_party/libgav1/src/src/threading_strategy.cc +++ b/chromium/third_party/libgav1/src/src/threading_strategy.cc @@ -16,15 +16,52 @@ #include <algorithm> #include <cassert> +#include <memory> +#include "src/frame_scratch_buffer.h" #include "src/utils/constants.h" #include "src/utils/logging.h" +#include "src/utils/vector.h" namespace libgav1 { +namespace { + +// Computes the number of frame threads to be used based on the following +// heuristic: +// * If |thread_count| == 1, return 0. +// * If |thread_count| <= |tile_count| * 4, return 0. +// * Otherwise, return the largest value of i which satisfies the following +// condition: i + i * tile_columns <= thread_count. This ensures that there +// are at least |tile_columns| worker threads for each frame thread. +// * This function will never return 1 or a value > |thread_count|. +// +// This heuristic is based empirical performance data. The in-frame threading +// model (combination of tile multithreading, superblock row multithreading and +// post filter multithreading) performs better than the frame parallel model +// until we reach the threshold of |thread_count| > |tile_count| * 4. +// +// It is a function of |tile_count| since tile threading and superblock row +// multithreading will scale only as a factor of |tile_count|. 
The threshold 4 +// is arrived at based on empirical data. The general idea is that superblock +// row multithreading plateaus at 4 * |tile_count| because in most practical +// cases there aren't more than that many superblock rows and columns available +// to work on in parallel. +int ComputeFrameThreadCount(int thread_count, int tile_count, + int tile_columns) { + assert(thread_count > 0); + if (thread_count == 1) return 0; + return (thread_count <= tile_count * 4) + ? 0 + : std::max(2, thread_count / (1 + tile_columns)); +} + +} // namespace bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, int thread_count) { assert(thread_count > 0); + frame_parallel_ = false; + if (thread_count == 1) { thread_pool_.reset(nullptr); tile_thread_count_ = 0; @@ -103,14 +140,74 @@ bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, return true; } +bool ThreadingStrategy::Reset(int thread_count) { + assert(thread_count > 0); + frame_parallel_ = true; + + // In frame parallel mode, we simply access the underlying |thread_pool_| + // directly. So ensure all the other threadpool getter functions return + // nullptr. Also, superblock row multithreading is always disabled in frame + // parallel mode. + tile_thread_count_ = 0; + max_tile_index_for_row_threads_ = 0; + + if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) { + thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count); + if (thread_pool_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.", + thread_count); + return false; + } + } + return true; +} + bool InitializeThreadPoolsForFrameParallel( - int thread_count, std::unique_ptr<ThreadPool>* const frame_thread_pool) { - *frame_thread_pool = ThreadPool::Create(thread_count); + int thread_count, int tile_count, int tile_columns, + std::unique_ptr<ThreadPool>* const frame_thread_pool, + FrameScratchBufferPool* const frame_scratch_buffer_pool) { + assert(*frame_thread_pool == nullptr); + thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)); + const int frame_threads = + ComputeFrameThreadCount(thread_count, tile_count, tile_columns); + if (frame_threads == 0) return true; + *frame_thread_pool = ThreadPool::Create(frame_threads); if (*frame_thread_pool == nullptr) { LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.", - thread_count); + frame_threads); return false; } + int remaining_threads = thread_count - frame_threads; + if (remaining_threads == 0) return true; + int threads_per_frame = remaining_threads / frame_threads; + const int extra_threads = remaining_threads % frame_threads; + Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers; + if (!frame_scratch_buffers.reserve(frame_threads)) return false; + // Create the tile thread pools. + for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) { + std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = + frame_scratch_buffer_pool->Get(); + if (frame_scratch_buffer == nullptr) { + return false; + } + // If the number of tile threads cannot be divided equally amongst all the + // frame threads, assign one extra thread to the first |extra_threads| frame + // threads. 
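
As a concrete walk-through of the heuristic and thread split above (hypothetical numbers): with thread_count = 16, tile_count = 2 and tile_columns = 2, the frame thread count is max(2, 16 / 3) = 5, leaving 11 worker threads, so threads_per_frame = 2 and extra_threads = 1, giving per-frame worker pools of sizes {3, 2, 2, 2, 2}. A standalone copy of the arithmetic for experimentation (not the library's API):

#include <algorithm>
#include <cstdio>

// Standalone copy of the frame-thread heuristic above.
int FrameThreads(int thread_count, int tile_count, int tile_columns) {
  if (thread_count == 1) return 0;
  return (thread_count <= tile_count * 4)
             ? 0
             : std::max(2, thread_count / (1 + tile_columns));
}

int main() {
  const int thread_count = 16, tile_count = 2, tile_columns = 2;
  const int frame_threads = FrameThreads(thread_count, tile_count, tile_columns);
  const int remaining = thread_count - frame_threads;       // 11
  const int threads_per_frame = remaining / frame_threads;  // 2
  const int extra_threads = remaining % frame_threads;      // 1
  std::printf("frame_threads=%d threads_per_frame=%d extra_threads=%d\n",
              frame_threads, threads_per_frame, extra_threads);
  return 0;
}
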
+ const int current_frame_thread_count = + threads_per_frame + static_cast<int>(i < extra_threads); + if (!frame_scratch_buffer->threading_strategy.Reset( + current_frame_thread_count)) { + return false; + } + remaining_threads -= current_frame_thread_count; + frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer)); + } + // We release the frame scratch buffers in reverse order so that the extra + // threads are allocated to buffers in the top of the stack. + for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0; + --i) { + frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i])); + } return true; } diff --git a/chromium/third_party/libgav1/src/src/threading_strategy.h b/chromium/third_party/libgav1/src/src/threading_strategy.h index 5822bb31f36..84b35896d26 100644 --- a/chromium/third_party/libgav1/src/src/threading_strategy.h +++ b/chromium/third_party/libgav1/src/src/threading_strategy.h @@ -25,6 +25,8 @@ namespace libgav1 { +class FrameScratchBufferPool; + // This class allocates and manages the worker threads among thread pools used // for multi-threaded decoding. class ThreadingStrategy { @@ -36,18 +38,28 @@ class ThreadingStrategy { ThreadingStrategy& operator=(const ThreadingStrategy&) = delete; // Creates or re-allocates the thread pools based on the |frame_header| and - // |thread_count|. This function is idempotent if the |frame_header| and - // |thread_count| doesn't change between calls (it will only create new - // threads on the first call and do nothing on the subsequent calls). This - // function also starts the worker threads whenever it creates new thread - // pools. + // |thread_count|. This function is used only in non frame-parallel mode. This + // function is idempotent if the |frame_header| and |thread_count| don't + // change between calls (it will only create new threads on the first call and + // do nothing on the subsequent calls). This function also starts the worker + // threads whenever it creates new thread pools. // The following strategy is used to allocate threads: // * One thread is allocated for decoding each Tile. // * Any remaining threads are allocated for superblock row multi-threading // within each of the tile in a round robin fashion. + // Note: During the lifetime of a ThreadingStrategy object, only one of the + // Reset() variants will be used. LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header, int thread_count); + // Creates or re-allocates a thread pool with |thread_count| threads. This + // function is used only in frame parallel mode. This function is idempotent + // if the |thread_count| doesn't change between calls (it will only create new + // threads on the first call and do nothing on the subsequent calls). + // Note: During the lifetime of a ThreadingStrategy object, only one of the + // Reset() variants will be used. + LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count); + // Returns a pointer to the ThreadPool that is to be used for Tile // multi-threading. ThreadPool* tile_thread_pool() const { @@ -56,8 +68,14 @@ class ThreadingStrategy { int tile_thread_count() const { return tile_thread_count_; } + // Returns a pointer to the underlying ThreadPool. + // Note: Valid only when |frame_parallel_| is true. This is used for + // facilitating in-frame multi-threading in that case. 
+ ThreadPool* thread_pool() const { return thread_pool_.get(); } + // Returns a pointer to the ThreadPool that is to be used within the Tile at // index |tile_index| for superblock row multi-threading. + // Note: Valid only when |frame_parallel_| is false. ThreadPool* row_thread_pool(int tile_index) const { return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get() : nullptr; @@ -65,20 +83,48 @@ class ThreadingStrategy { // Returns a pointer to the ThreadPool that is to be used for post filter // multi-threading. - ThreadPool* post_filter_thread_pool() const { return thread_pool_.get(); } + // Note: Valid only when |frame_parallel_| is false. + ThreadPool* post_filter_thread_pool() const { + return frame_parallel_ ? nullptr : thread_pool_.get(); + } // Returns a pointer to the ThreadPool that is to be used for film grain // synthesis and blending. + // Note: Valid only when |frame_parallel_| is false. ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); } private: std::unique_ptr<ThreadPool> thread_pool_; - int tile_thread_count_; - int max_tile_index_for_row_threads_; + int tile_thread_count_ = 0; + int max_tile_index_for_row_threads_ = 0; + bool frame_parallel_ = false; }; +// Initializes the |frame_thread_pool| and the necessary worker threadpools (the +// threading_strategy objects in each of the frame scratch buffer in +// |frame_scratch_buffer_pool|) as follows: +// * frame_threads = ComputeFrameThreadCount(); +// * For more details on how frame_threads is computed, see the function +// comment in ComputeFrameThreadCount(). +// * |frame_thread_pool| is created with |frame_threads| threads. +// * divide the remaining number of threads into each frame thread and +// initialize a frame_scratch_buffer.threading_strategy for each frame +// thread. +// When this function is called, |frame_scratch_buffer_pool| must be empty. If +// this function returns true, it means the initialization was successful and +// one of the following is true: +// * |frame_thread_pool| has been successfully initialized and +// |frame_scratch_buffer_pool| has been successfully populated with +// |frame_threads| buffers to be used by each frame thread. The total +// number of threads that this function creates will always be equal to +// |thread_count|. +// * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not +// modified. This means that frame threading will not be used and the +// decoder will continue to operate normally in non frame parallel mode. 
LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel( - int thread_count, std::unique_ptr<ThreadPool>* frame_thread_pool); + int thread_count, int tile_count, int tile_columns, + std::unique_ptr<ThreadPool>* frame_thread_pool, + FrameScratchBufferPool* frame_scratch_buffer_pool); } // namespace libgav1 diff --git a/chromium/third_party/libgav1/src/src/tile.h b/chromium/third_party/libgav1/src/src/tile.h index d8f48b4df27..7fb7e2296c0 100644 --- a/chromium/third_party/libgav1/src/src/tile.h +++ b/chromium/third_party/libgav1/src/src/tile.h @@ -33,7 +33,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/frame_scratch_buffer.h" -#include "src/loop_filter_mask.h" #include "src/loop_restoration_info.h" #include "src/obu_parser.h" #include "src/post_filter.h" @@ -77,16 +76,14 @@ class Tile : public Allocable { const WedgeMaskArray& wedge_masks, SymbolDecoderContext* const saved_symbol_decoder_context, const SegmentationMap* prev_segment_ids, PostFilter* const post_filter, - BlockParametersHolder* const block_parameters_holder, const dsp::Dsp* const dsp, ThreadPool* const thread_pool, BlockingCounterWithStatus* const pending_tiles, bool frame_parallel, bool use_intra_prediction_buffer) { std::unique_ptr<Tile> tile(new (std::nothrow) Tile( tile_number, data, size, sequence_header, frame_header, current_frame, state, frame_scratch_buffer, wedge_masks, saved_symbol_decoder_context, - prev_segment_ids, post_filter, block_parameters_holder, dsp, - thread_pool, pending_tiles, frame_parallel, - use_intra_prediction_buffer)); + prev_segment_ids, post_filter, dsp, thread_pool, pending_tiles, + frame_parallel, use_intra_prediction_buffer)); return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr; } @@ -100,9 +97,17 @@ class Tile : public Allocable { // Parses the entire tile. bool Parse(); + // Decodes the entire tile. |superblock_row_progress| and + // |superblock_row_progress_condvar| are arrays of size equal to the number of + // superblock rows in the frame. Increments |superblock_row_progress[i]| after + // each superblock row at index |i| is decoded. If the count reaches the + // number of tile columns, then it notifies + // |superblock_row_progress_condvar[i]|. + bool Decode(std::mutex* mutex, int* superblock_row_progress, + std::condition_variable* superblock_row_progress_condvar); // Parses and decodes the entire tile. Depending on the configuration of this // Tile, this function may do multithreaded decoding. - bool ParseAndDecode(bool is_main_thread); // 5.11.2. + bool ParseAndDecode(); // 5.11.2. // Processes all the columns of the superblock row at |row4x4| that are within // this Tile. If |save_symbol_decoder_context| is true, then // SaveSymbolDecoderContext() is invoked for the last superblock row. @@ -118,10 +123,14 @@ class Tile : public Allocable { return reference_frame_sign_bias_; } + bool IsRow4x4Inside(int row4x4) const { + return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_; + } + // 5.11.51. 
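
The Decode() contract documented above is a per-row fan-in: every tile column increments the shared counter for a superblock row, and the matching condition variable fires once all columns have contributed, letting downstream stages wait on exactly the rows they need. A self-contained sketch of that pattern with illustrative names; |progress| and |row_condvar| are arrays with one entry per superblock row, as in Decode():

#include <condition_variable>
#include <mutex>

// Called by a tile after decoding superblock row |row| in its column.
void MarkSuperBlockRowDecoded(int row, int tile_columns, std::mutex* mutex,
                              int* progress,
                              std::condition_variable* row_condvar) {
  std::lock_guard<std::mutex> lock(*mutex);
  if (++progress[row] == tile_columns) row_condvar[row].notify_all();
}

// Called by a consumer (e.g. a later pipeline stage) before touching |row|.
void WaitForSuperBlockRow(int row, int tile_columns, std::mutex* mutex,
                          int* progress,
                          std::condition_variable* row_condvar) {
  std::unique_lock<std::mutex> lock(*mutex);
  row_condvar[row].wait(lock,
                        [&] { return progress[row] >= tile_columns; });
}
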
bool IsInside(int row4x4, int column4x4) const { - return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_ && - column4x4 >= column4x4_start_ && column4x4 < column4x4_end_; + return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ && + column4x4 < column4x4_end_; } bool IsLeftInside(int column4x4) const { @@ -168,9 +177,13 @@ class Tile : public Allocable { const BlockParameters& Parameters(int row, int column) const { return *block_parameters_holder_.Find(row, column); } + int number() const { return number_; } int superblock_rows() const { return superblock_rows_; } int superblock_columns() const { return superblock_columns_; } + int row4x4_start() const { return row4x4_start_; } + int column4x4_start() const { return column4x4_start_; } + int column4x4_end() const { return column4x4_end_; } private: Tile(int tile_number, const uint8_t* data, size_t size, @@ -180,9 +193,9 @@ class Tile : public Allocable { const WedgeMaskArray& wedge_masks, SymbolDecoderContext* saved_symbol_decoder_context, const SegmentationMap* prev_segment_ids, PostFilter* post_filter, - BlockParametersHolder* block_parameters_holder, const dsp::Dsp* dsp, - ThreadPool* thread_pool, BlockingCounterWithStatus* pending_tiles, - bool frame_parallel, bool use_intra_prediction_buffer); + const dsp::Dsp* dsp, ThreadPool* thread_pool, + BlockingCounterWithStatus* pending_tiles, bool frame_parallel, + bool use_intra_prediction_buffer); // Stores the transform tree state when reading variable size transform trees // and when applying the transform tree. When applying the transform tree, @@ -201,16 +214,20 @@ class Tile : public Allocable { int depth; }; + // Enum to track the processing state of a superblock. + enum SuperBlockState : uint8_t { + kSuperBlockStateNone, // Not yet parsed or decoded. + kSuperBlockStateParsed, // Parsed but not yet decoded. + kSuperBlockStateScheduled, // Scheduled for decoding. + kSuperBlockStateDecoded // Parsed and decoded. + }; + // Parameters used to facilitate multi-threading within the Tile. struct ThreadingParameters { std::mutex mutex; - // Array2DView of size |superblock_rows_| by |superblock_columns_| - // containing the processing state of each superblock. The code in this - // class uses relative indexing of superblocks with respect to this Tile. - // The memory for this comes from the caller (the |super_block_state| - // parameter in the constructor). The memory is for the whole frame whereas - // the |sb_state| array in this struct points to the beginning of this Tile. - Array2DView<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex); + // 2d array of size |superblock_rows_| by |superblock_columns_| containing + // the processing state of each superblock. + Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex); // Variable used to indicate either parse or decode failure. bool abort LIBGAV1_GUARDED_BY(mutex) = false; int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0; @@ -297,14 +314,6 @@ class Tile : public Allocable { void ResetLoopRestorationParams(); void ReadLoopRestorationCoefficients(int row4x4, int column4x4, BlockSize block_size); // 5.11.57. - // Build bit masks for vertical edges followed by horizontal edges. - // Traverse through each transform edge in the current coding block, and - // determine if a 4x4 edge needs filtering. If filtering is needed, determine - // filter length. Set corresponding bit mask to 1. 
- void BuildBitMask(const Block& block); - void BuildBitMaskHelper(const Block& block, int row4x4, int column4x4, - BlockSize block_size, bool is_vertical_block_border, - bool is_horizontal_block_border); // Helper functions for DecodeBlock. bool ReadSegmentId(const Block& block); // 5.11.9. @@ -582,8 +591,8 @@ class Tile : public Allocable { } const int number_; - int row_; - int column_; + const int row_; + const int column_; const uint8_t* const data_; size_t size_; int row4x4_start_; @@ -729,14 +738,17 @@ class Tile : public Allocable { int8_t delta_lf_[kFrameLfCount]; // True if all the values in |delta_lf_| are zero. False otherwise. bool delta_lf_all_zero_; - bool build_bit_mask_when_parsing_; const bool frame_parallel_; const bool use_intra_prediction_buffer_; // Buffer used to store the unfiltered pixels that are necessary for decoding // the next superblock row (for the intra prediction process). Used only if - // |use_intra_prediction_buffer_| is true. - std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes> - intra_prediction_buffer_; + // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains + // one row buffer for each tile row. This tile will have to use the buffer + // corresponding to this tile's row. + IntraPredictionBuffer* const intra_prediction_buffer_; + // Stores the progress of the reference frames. This will be used to avoid + // unnecessary calls into RefCountedBuffer::WaitUntil(). + std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_; }; struct Tile::Block { diff --git a/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc b/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc index c13fbe3b907..1bae5a3c1b6 100644 --- a/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc +++ b/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc @@ -1100,12 +1100,11 @@ uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) { uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) { const BlockParameters& bp = *block.bp; - const int forward = std::abs(GetRelativeDistance( - current_frame_.order_hint(bp.reference_frame[0]), - frame_header_.order_hint, sequence_header_.order_hint_shift_bits)); - const int backward = std::abs(GetRelativeDistance( - current_frame_.order_hint(bp.reference_frame[1]), - frame_header_.order_hint, sequence_header_.order_hint_shift_bits)); + const ReferenceInfo& reference_info = *current_frame_.reference_info(); + const int forward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]); + const int backward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]); int context = (forward == backward) ? 3 : 0; if (block.top_available[kPlaneY]) { if (!block.IsTopSingle()) { diff --git a/chromium/third_party/libgav1/src/src/tile/prediction.cc b/chromium/third_party/libgav1/src/src/tile/prediction.cc index 672b5a2b3a7..785c1dac404 100644 --- a/chromium/third_party/libgav1/src/src/tile/prediction.cc +++ b/chromium/third_party/libgav1/src/src/tile/prediction.cc @@ -277,7 +277,6 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, (mode == kPredictionModeDc && has_left); const Pixel* top_row_src = buffer[y - 1]; - int top_row_offset = 0; // Determine if we need to retrieve the top row from // |intra_prediction_buffer_|. 
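The hunks above and below drop |top_row_offset| because each tile row now shares a single frame-width intra prediction row buffer: writers copy their pixels at the tile's column offset (see the memcpy change in PopulateIntraPredictionBuffer further down), so readers can index the buffer with the frame-absolute |x|. A minimal sketch of that convention follows; it is illustrative only, and PlaneRowBuffer, CopyTopRow, and TopRowAt are hypothetical names, not libgav1 APIs.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical frame-width row buffer shared by every tile in one tile row.
struct PlaneRowBuffer {
  std::vector<uint8_t> pixels;  // one pixel row spanning the full frame width
};

// Writer side (after a superblock row is decoded): copy this tile's pixels at
// the tile's own column offset so the buffer stays frame-indexed.
inline void CopyTopRow(PlaneRowBuffer* buffer, const uint8_t* tile_pixels,
                       size_t column_start, size_t width) {
  std::memcpy(buffer->pixels.data() + column_start, tile_pixels, width);
}

// Reader side (intra prediction of the next superblock row): index with the
// frame-absolute x; no tile-relative offset is needed.
inline const uint8_t* TopRowAt(const PlaneRowBuffer& buffer, size_t x) {
  return buffer.pixels.data() + x;
}

This is the same idea that lets the write side below add column_start * pixel_size to the memcpy destination while the reads in IntraPrediction use top_row_src[x] directly.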
@@ -295,13 +294,8 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, // then we will have to retrieve the top row from the // |intra_prediction_buffer_|. if (current_superblock_index != top_row_superblock_index) { - top_row_src = - reinterpret_cast<const Pixel*>(intra_prediction_buffer_[plane].get()); - // The |intra_prediction_buffer_| only stores the top row for this Tile. - // The |x| value in this function is absolute to the frame. So in order to - // make it relative to this Tile, all acccesses into top_row_src must be - // offset by negative |top_row_offset|. - top_row_offset = MultiplyBy4(column4x4_start_) >> subsampling_x_[plane]; + top_row_src = reinterpret_cast<const Pixel*>( + (*intra_prediction_buffer_)[plane].get()); } } @@ -309,8 +303,7 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, // Compute top_row. if (has_top || has_left) { const int left_index = has_left ? x - 1 : x; - top_row[-1] = has_top ? top_row_src[left_index - top_row_offset] - : buffer[y][left_index]; + top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index]; } else { top_row[-1] = 1 << (bitdepth - 1); } @@ -320,14 +313,12 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size); } else { const int top_limit = std::min(max_x - x + 1, top_right_size); - memcpy(top_row, &top_row_src[x - top_row_offset], - top_limit * sizeof(Pixel)); + memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel)); // Even though it is safe to call Memset with a size of 0, accessing // top_row_src[top_limit - x + 1] is not allowed when this condition is // false. if (top_size - top_limit > 0) { - Memset(top_row + top_limit, - top_row_src[top_limit + x - 1 - top_row_offset], + Memset(top_row + top_limit, top_row_src[top_limit + x - 1], top_size - top_limit); } } @@ -336,13 +327,13 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, // Compute left_column. if (has_top || has_left) { const int left_index = has_left ? x - 1 : x; - left_column[-1] = has_top ? top_row_src[left_index - top_row_offset] - : buffer[y][left_index]; + left_column[-1] = + has_top ? top_row_src[left_index] : buffer[y][left_index]; } else { left_column[-1] = 1 << (bitdepth - 1); } if (!has_left && has_top) { - Memset(left_column, top_row_src[x - top_row_offset], left_size); + Memset(left_column, top_row_src[x], left_size); } else if (!has_left && !has_top) { Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size); } else { @@ -942,14 +933,13 @@ void Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1, for (int reference = 0; reference < 2; ++reference) { const BlockParameters& bp = *block_parameters_holder_.Find(candidate_row, candidate_column); - const unsigned int reference_hint = - current_frame_.order_hint(bp.reference_frame[reference]); // Note: distance[0] and distance[1] correspond to relative distance // between current frame and reference frame [1] and [0], respectively. 
- distance[1 - reference] = Clip3( - std::abs(GetRelativeDistance(reference_hint, frame_header_.order_hint, - sequence_header_.order_hint_shift_bits)), - 0, kMaxFrameDistance); + distance[1 - reference] = std::min( + std::abs(static_cast<int>( + current_frame_.reference_info() + ->relative_distance_from[bp.reference_frame[reference]])), + static_cast<int>(kMaxFrameDistance)); } GetDistanceWeights(distance, weight); @@ -1136,7 +1126,11 @@ bool Tile::BlockInterPrediction( // reference_y_max by 2 since we only track the progress of Y planes. reference_y_max = LeftShift(reference_y_max, subsampling_y); } - if (!reference_frames_[reference_frame_index]->WaitUntil(reference_y_max)) { + if (reference_frame_progress_cache_[reference_frame_index] < + reference_y_max && + !reference_frames_[reference_frame_index]->WaitUntil( + reference_y_max, + &reference_frame_progress_cache_[reference_frame_index])) { return false; } } @@ -1275,7 +1269,11 @@ bool Tile::BlockWarpProcess(const Block& block, const Plane plane, // For U and V planes with subsampling, we need to multiply reference_y_max // by 2 since we only track the progress of Y planes. reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]); - if (!reference_frames_[reference_frame_index]->WaitUntil(reference_y_max)) { + if (reference_frame_progress_cache_[reference_frame_index] < + reference_y_max && + !reference_frames_[reference_frame_index]->WaitUntil( + reference_y_max, + &reference_frame_progress_cache_[reference_frame_index])) { return false; } } diff --git a/chromium/third_party/libgav1/src/src/tile/tile.cc b/chromium/third_party/libgav1/src/src/tile/tile.cc index 50daf1add34..ed00e282018 100644 --- a/chromium/third_party/libgav1/src/src/tile/tile.cc +++ b/chromium/third_party/libgav1/src/src/tile/tile.cc @@ -17,6 +17,7 @@ #include <algorithm> #include <array> #include <cassert> +#include <climits> #include <cstdlib> #include <cstring> #include <memory> @@ -100,6 +101,14 @@ constexpr PredictionMode kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal, kPredictionModeD157, kPredictionModeDc}; +// Mask used to determine the index for mode_deltas lookup. +constexpr BitMaskSet kPredictionModeDeltasMask( + kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv, + kPredictionModeNearestNearestMv, kPredictionModeNearNearMv, + kPredictionModeNearestNewMv, kPredictionModeNewNearestMv, + kPredictionModeNearNewMv, kPredictionModeNewNearMv, + kPredictionModeNewNewMv); + // This is computed as: // min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4. 
constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = { @@ -383,12 +392,13 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, const WedgeMaskArray& wedge_masks, SymbolDecoderContext* const saved_symbol_decoder_context, const SegmentationMap* prev_segment_ids, - PostFilter* const post_filter, - BlockParametersHolder* const block_parameters_holder, - const dsp::Dsp* const dsp, ThreadPool* const thread_pool, + PostFilter* const post_filter, const dsp::Dsp* const dsp, + ThreadPool* const thread_pool, BlockingCounterWithStatus* const pending_tiles, bool frame_parallel, bool use_intra_prediction_buffer) : number_(tile_number), + row_(number_ / frame_header.tile_info.tile_columns), + column_(number_ % frame_header.tile_info.tile_columns), data_(data), size_(size), read_deltas_(false), @@ -410,7 +420,7 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, prev_segment_ids_(prev_segment_ids), dsp_(*dsp), post_filter_(*post_filter), - block_parameters_holder_(*block_parameters_holder), + block_parameters_holder_(frame_scratch_buffer->block_parameters_holder), quantizer_(sequence_header_.color_config.bitdepth, &frame_header_.quantizer), residual_size_((sequence_header_.color_config.bitdepth == 8) @@ -428,11 +438,12 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, tile_scratch_buffer_pool_( &frame_scratch_buffer->tile_scratch_buffer_pool), pending_tiles_(pending_tiles), - build_bit_mask_when_parsing_(false), frame_parallel_(frame_parallel), - use_intra_prediction_buffer_(use_intra_prediction_buffer) { - row_ = number_ / frame_header.tile_info.tile_columns; - column_ = number_ % frame_header.tile_info.tile_columns; + use_intra_prediction_buffer_(use_intra_prediction_buffer), + intra_prediction_buffer_( + use_intra_prediction_buffer_ + ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_] + : nullptr) { row4x4_start_ = frame_header.tile_info.tile_row_start[row_]; row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1]; column4x4_start_ = frame_header.tile_info.tile_column_start[column_]; @@ -454,6 +465,9 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, split_parse_and_decode_ = (thread_pool_ != nullptr && superblock_columns_ > intra_block_copy_lag_) || frame_parallel; + if (frame_parallel_) { + reference_frame_progress_cache_.fill(INT_MIN); + } memset(delta_lf_, 0, sizeof(delta_lf_)); delta_lf_all_zero_ = true; const YuvBuffer& buffer = post_filter_.frame_buffer(); @@ -491,21 +505,6 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size, std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3) << subsampling_x_[plane]); } - auto& superblock_state = frame_scratch_buffer->superblock_state; - if (split_parse_and_decode_ && superblock_state.rows() > 0) { - // The |superblock_state| array is for the entire frame. Set - // |threading_.sb_state| to point to the beginning of this Tile. 
- std::lock_guard<std::mutex> lock(threading_.mutex); - const int superblock_width_log2 = - FloorLog2(kBlockWidthPixels[SuperBlockSize()]); - const int superblock_row_start_index = - MultiplyBy4(row4x4_start_) >> superblock_width_log2; - const int superblock_column_start_index = - MultiplyBy4(column4x4_start_) >> superblock_width_log2; - threading_.sb_state.Reset(superblock_rows_, superblock_state.columns(), - &superblock_state[superblock_row_start_index] - [superblock_column_start_index]); - } } bool Tile::Init() { @@ -545,28 +544,11 @@ bool Tile::Init() { return false; } } - if (use_intra_prediction_buffer_) { - for (int plane = 0; plane < PlaneCount(); ++plane) { - const size_t intra_prediction_buffer_size = - (MultiplyBy4(column4x4_end_ - column4x4_start_) >> - subsampling_x_[plane]) * - (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t) - : sizeof(uint16_t)); - if (!intra_prediction_buffer_[plane].Resize( - intra_prediction_buffer_size)) { - LIBGAV1_DLOG( - ERROR, "Failed to allocate intra prediction buffer for plane %d.\n", - plane); - return false; - } - } - } if (frame_header_.use_ref_frame_mvs) { assert(sequence_header_.enable_order_hint); SetupMotionField(frame_header_, current_frame_, reference_frames_, - sequence_header_.order_hint_shift_bits, row4x4_start_, - row4x4_end_, column4x4_start_, column4x4_end_, - &motion_field_); + row4x4_start_, row4x4_end_, column4x4_start_, + column4x4_end_, &motion_field_); } ResetLoopRestorationParams(); return true; @@ -612,11 +594,10 @@ void Tile::SaveSymbolDecoderContext() { } } -bool Tile::ParseAndDecode(bool is_main_thread) { +bool Tile::ParseAndDecode() { // If this is the main thread, we build the loop filter bit masks when parsing // so that it happens in the current thread. This ensures that the main thread // does as much work as possible. - build_bit_mask_when_parsing_ = is_main_thread; if (split_parse_and_decode_) { if (!ThreadedParseAndDecode()) return false; SaveSymbolDecoderContext(); @@ -663,9 +644,72 @@ bool Tile::Parse() { return true; } +bool Tile::Decode( + std::mutex* const mutex, int* const superblock_row_progress, + std::condition_variable* const superblock_row_progress_condvar) { + const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16; + const int block_width4x4_log2 = + sequence_header_.use_128x128_superblock ? 5 : 4; + std::unique_ptr<TileScratchBuffer> scratch_buffer = + tile_scratch_buffer_pool_->Get(); + if (scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer."); + return false; + } + for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2; + row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) { + if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>( + row4x4, scratch_buffer.get())) { + return false; + } + if (post_filter_.DoDeblock()) { + // Apply vertical deblock filtering for all the columns in this tile + // except for the first 64 columns. + post_filter_.ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, + column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_, + block_width4x4); + // If this is the first superblock row of the tile, then we cannot apply + // horizontal deblocking here since we don't know if the top row is + // available. So it will be done by the calling thread in that case. + if (row4x4 != row4x4_start_) { + // Apply horizontal deblock filtering for all the columns in this tile + // except for the first and the last 64 columns. 
+ // Note about the last tile of each row: For the last tile, + // column4x4_end may not be a multiple of 16. In that case it is still + // okay to simply subtract 16 since ApplyDeblockFilter() will only do + // the filters in increments of 64 columns (or 32 columns for chroma + // with subsampling). + post_filter_.ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, + column4x4_start_ + kNum4x4InLoopFilterUnit, + column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4); + } + } + bool notify; + { + std::unique_lock<std::mutex> lock(*mutex); + notify = ++superblock_row_progress[index] == + frame_header_.tile_info.tile_columns; + } + if (notify) { + // We are done decoding this superblock row. Notify the post filtering + // thread. + superblock_row_progress_condvar[index].notify_one(); + } + } + tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); + return true; +} + bool Tile::ThreadedParseAndDecode() { { std::lock_guard<std::mutex> lock(threading_.mutex); + if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) { + pending_tiles_->Decrement(false); + LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed."); + return false; + } // Account for the parsing job. ++threading_.pending_jobs; } @@ -826,14 +870,16 @@ void Tile::PopulateIntraPredictionBuffer(int row4x4) { if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) { return; } + const size_t pixel_size = + (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t) + : sizeof(uint16_t)); for (int plane = 0; plane < PlaneCount(); ++plane) { const int row_to_copy = (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1; const size_t pixels_to_copy = (MultiplyBy4(column4x4_end_ - column4x4_start_) >> subsampling_x_[plane]) * - (sequence_header_.color_config.bitdepth == 8 ? 
sizeof(uint8_t) - : sizeof(uint16_t)); + pixel_size; const size_t column_start = MultiplyBy4(column4x4_start_) >> subsampling_x_[plane]; void* start; @@ -848,7 +894,8 @@ void Tile::PopulateIntraPredictionBuffer(int row4x4) { { start = &buffer_[plane][row_to_copy][column_start]; } - memcpy(intra_prediction_buffer_[plane].get(), start, pixels_to_copy); + memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size, + start, pixels_to_copy); } } @@ -2067,15 +2114,16 @@ bool Tile::ComputePrediction(const Block& block) { void Tile::PopulateDeblockFilterLevel(const Block& block) { if (!post_filter_.DoDeblock()) return; BlockParameters& bp = *block.bp; + const int mode_id = + static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode)); for (int i = 0; i < kFrameLfCount; ++i) { if (delta_lf_all_zero_) { bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel( - bp.segment_id, i, bp.reference_frame[0], - LoopFilterMask::GetModeId(bp.y_mode)); + bp.segment_id, i, bp.reference_frame[0], mode_id); } else { bp.deblock_filter_level[i] = deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]] - [LoopFilterMask::GetModeId(bp.y_mode)]; + [mode_id]; } } } @@ -2138,10 +2186,6 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit, y_limit, bp.segment_id); } - if (kDeblockFilterBitMask && - (build_bit_mask_when_parsing_ || !split_parse_and_decode_)) { - BuildBitMask(block); - } StoreMotionFieldMvsIntoCurrentFrame(block); if (!split_parse_and_decode_) { prediction_parameters_ = std::move(bp.prediction_parameters); @@ -2164,9 +2208,6 @@ bool Tile::DecodeBlock(ParameterTree* const tree, !Residual(block, kProcessingModeDecodeOnly)) { return false; } - if (kDeblockFilterBitMask && !build_bit_mask_when_parsing_) { - BuildBitMask(block); - } block.bp->prediction_parameters.reset(nullptr); return true; } @@ -2451,176 +2492,11 @@ void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4, } } -void Tile::BuildBitMask(const Block& block) { - if (!post_filter_.DoDeblock()) return; - if (block.size <= kBlock64x64) { - BuildBitMaskHelper(block, block.row4x4, block.column4x4, block.size, true, - true); - } else { - const int block_width4x4 = kNum4x4BlocksWide[block.size]; - const int block_height4x4 = kNum4x4BlocksHigh[block.size]; - for (int y = 0; y < block_height4x4; y += 16) { - for (int x = 0; x < block_width4x4; x += 16) { - BuildBitMaskHelper(block, block.row4x4 + y, block.column4x4 + x, - kBlock64x64, x == 0, y == 0); - } - } - } -} - -void Tile::BuildBitMaskHelper(const Block& block, int row4x4, int column4x4, - BlockSize block_size, - const bool is_vertical_block_border, - const bool is_horizontal_block_border) { - const int block_width4x4 = kNum4x4BlocksWide[block_size]; - const int block_height4x4 = kNum4x4BlocksHigh[block_size]; - BlockParameters& bp = *block.bp; - const bool skip = bp.skip && bp.is_inter; - LoopFilterMask* const masks = post_filter_.masks(); - const int unit_id = DivideBy16(row4x4) * masks->num_64x64_blocks_per_row() + - DivideBy16(column4x4); - - for (int plane = kPlaneY; plane < PlaneCount(); ++plane) { - // For U and V planes, do not build bit masks if level == 0. - if (plane > kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) { - continue; - } - // Build bit mask for vertical edges. 
- const int subsampling_x = subsampling_x_[plane]; - const int subsampling_y = subsampling_y_[plane]; - const int column_limit = - std::min(column4x4 + block_width4x4, deblock_column_limit_[plane]); - const int row_limit = - std::min(row4x4 + block_height4x4, deblock_row_limit_[plane]); - const int row_start = GetDeblockPosition(row4x4, subsampling_y); - const int column_start = GetDeblockPosition(column4x4, subsampling_x); - if (row_start >= row_limit || column_start >= column_limit) { - continue; - } - const int vertical_step = 1 << subsampling_y; - const int horizontal_step = 1 << subsampling_x; - const BlockParameters& bp = - *block_parameters_holder_.Find(row_start, column_start); - const int horizontal_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal]; - const int vertical_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical]; - const uint8_t vertical_level = - bp.deblock_filter_level[vertical_level_index]; - - for (int row = row_start; row < row_limit; row += vertical_step) { - for (int column = column_start; column < column_limit;) { - const TransformSize tx_size = (plane == kPlaneY) - ? inter_transform_sizes_[row][column] - : bp.uv_transform_size; - // (1). Don't filter frame boundary. - // (2). For tile boundary, we don't know whether the previous tile is - // available or not, thus we handle it after all tiles are decoded. - const bool is_vertical_border = - (column == column_start) && is_vertical_block_border; - if (column == GetDeblockPosition(column4x4_start_, subsampling_x) || - (skip && !is_vertical_border)) { - column += kNum4x4BlocksWide[tx_size] << subsampling_x; - continue; - } - - // bp_left is the parameter of the left prediction block which - // is guaranteed to be inside the tile. - const BlockParameters& bp_left = - *block_parameters_holder_.Find(row, column - horizontal_step); - const uint8_t left_level = - is_vertical_border - ? bp_left.deblock_filter_level[vertical_level_index] - : vertical_level; - // We don't have to check if the left block is skipped or not, - // because if the current transform block is on the edge of the coding - // block, is_vertical_border is true; if it's not on the edge, - // left skip is equal to skip. - if (vertical_level != 0 || left_level != 0) { - const TransformSize left_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes_[row][column - horizontal_step] - : bp_left.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - GetTransformSizeIdWidth(tx_size, left_tx_size); - const int r = row & (kNum4x4InLoopFilterMaskUnit - 1); - const int c = column & (kNum4x4InLoopFilterMaskUnit - 1); - const int shift = LoopFilterMask::GetShift(r, c); - const int index = LoopFilterMask::GetIndex(r); - const auto mask = static_cast<uint64_t>(1) << shift; - masks->SetLeft(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (vertical_level == 0) ? left_level : vertical_level; - masks->SetLevel(current_level, unit_id, plane, - kLoopFilterTypeVertical, - LoopFilterMask::GetLevelOffset(r, c)); - } - column += kNum4x4BlocksWide[tx_size] << subsampling_x; - } - } - - // Build bit mask for horizontal edges. - const uint8_t horizontal_level = - bp.deblock_filter_level[horizontal_level_index]; - for (int column = column_start; column < column_limit; - column += horizontal_step) { - for (int row = row_start; row < row_limit;) { - const TransformSize tx_size = (plane == kPlaneY) - ? inter_transform_sizes_[row][column] - : bp.uv_transform_size; - - // (1). 
Don't filter frame boundary. - // (2). For tile boundary, we don't know whether the previous tile is - // available or not, thus we handle it after all tiles are decoded. - const bool is_horizontal_border = - (row == row_start) && is_horizontal_block_border; - if (row == GetDeblockPosition(row4x4_start_, subsampling_y) || - (skip && !is_horizontal_border)) { - row += kNum4x4BlocksHigh[tx_size] << subsampling_y; - continue; - } - - // bp_top is the parameter of the top prediction block which is - // guaranteed to be inside the tile. - const BlockParameters& bp_top = - *block_parameters_holder_.Find(row - vertical_step, column); - const uint8_t top_level = - is_horizontal_border - ? bp_top.deblock_filter_level[horizontal_level_index] - : horizontal_level; - // We don't have to check it the top block is skipped or not, - // because if the current transform block is on the edge of the coding - // block, is_horizontal_border is true; if it's not on the edge, - // top skip is equal to skip. - if (horizontal_level != 0 || top_level != 0) { - const TransformSize top_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes_[row - vertical_step][column] - : bp_top.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - static_cast<LoopFilterTransformSizeId>( - std::min({kTransformHeightLog2[tx_size] - 2, - kTransformHeightLog2[top_tx_size] - 2, 2})); - const int r = row & (kNum4x4InLoopFilterMaskUnit - 1); - const int c = column & (kNum4x4InLoopFilterMaskUnit - 1); - const int shift = LoopFilterMask::GetShift(r, c); - const int index = LoopFilterMask::GetIndex(r); - const auto mask = static_cast<uint64_t>(1) << shift; - masks->SetTop(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (horizontal_level == 0) ? top_level : horizontal_level; - masks->SetLevel(current_level, unit_id, plane, - kLoopFilterTypeHorizontal, - LoopFilterMask::GetLevelOffset(r, c)); - } - row += kNum4x4BlocksHigh[tx_size] << subsampling_y; - } - } - } -} - void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { - if (frame_header_.refresh_frame_flags == 0) return; + if (frame_header_.refresh_frame_flags == 0 || + IsIntraFrame(frame_header_.frame_type)) { + return; + } // Iterate over odd rows/columns beginning at the first odd row/column for the // block. It is done this way because motion field mvs are only needed at a // 8x8 granularity. @@ -2636,6 +2512,7 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { // The largest reference MV component that can be saved. 
constexpr int kRefMvsLimit = (1 << 12) - 1; const BlockParameters& bp = *block.bp; + ReferenceInfo* reference_info = current_frame_.reference_info(); for (int i = 1; i >= 0; --i) { const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i]; // Must make a local copy so that StoreMotionFieldMvs() knows there is no @@ -2649,12 +2526,7 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { // The next line is equivalent to: // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit (mv_row | mv_column) <= kRefMvsLimit && - GetRelativeDistance( - reference_order_hint_ - [frame_header_.reference_frame_index[reference_frame_to_store - - kReferenceFrameLast]], - frame_header_.order_hint, - sequence_header_.order_hint_shift_bits) < 0) { + reference_info->relative_distance_from[reference_frame_to_store] < 0) { const int row_start8x8 = DivideBy2(row_start4x4); const int row_limit8x8 = DivideBy2(row_limit4x4); const int column_start8x8 = DivideBy2(column_start4x4); @@ -2663,10 +2535,10 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { const int columns = column_limit8x8 - column_start8x8; const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4()); ReferenceFrameType* const reference_frame_row_start = - current_frame_.motion_field_reference_frame(row_start8x8, - column_start8x8); + &reference_info + ->motion_field_reference_frame[row_start8x8][column_start8x8]; MotionVector* const mv = - current_frame_.motion_field_mv(row_start8x8, column_start8x8); + &reference_info->motion_field_mv[row_start8x8][column_start8x8]; // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined // and simplifies std::fill() for these cases. diff --git a/chromium/third_party/libgav1/src/src/utils/array_2d.h b/chromium/third_party/libgav1/src/src/utils/array_2d.h index 941d4b16f87..2df624187d0 100644 --- a/chromium/third_party/libgav1/src/src/utils/array_2d.h +++ b/chromium/third_party/libgav1/src/src/utils/array_2d.h @@ -113,6 +113,7 @@ class Array2D { int columns() const { return data_view_.columns(); } size_t size() const { return size_; } T* data() { return data_.get(); } + const T* data() const { return data_.get(); } T* operator[](int row) { return data_view_[row]; } diff --git a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc index b52e91d6c97..79bb2b8f7e1 100644 --- a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc +++ b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc @@ -35,13 +35,11 @@ int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) { } // namespace -BlockParametersHolder::BlockParametersHolder(int rows4x4, int columns4x4, - bool use_128x128_superblock) - : rows4x4_(rows4x4), - columns4x4_(columns4x4), - use_128x128_superblock_(use_128x128_superblock) {} - -bool BlockParametersHolder::Init() { +bool BlockParametersHolder::Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + use_128x128_superblock_ = use_128x128_superblock; if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) { LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed."); return false; diff --git a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h index 909de5eefa3..35543c30a4e 100644 --- 
a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h +++ b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h @@ -31,17 +31,16 @@ namespace libgav1 { // corresponding to a superblock. class BlockParametersHolder { public: - // If |use_128x128_superblock| is true, 128x128 superblocks will be used, - // otherwise 64x64 superblocks will be used. - BlockParametersHolder(int rows4x4, int columns4x4, - bool use_128x128_superblock); + BlockParametersHolder() = default; // Not copyable or movable. BlockParametersHolder(const BlockParametersHolder&) = delete; BlockParametersHolder& operator=(const BlockParametersHolder&) = delete; - // Must be called first. - LIBGAV1_MUST_USE_RESULT bool Init(); + // If |use_128x128_superblock| is true, 128x128 superblocks will be used, + // otherwise 64x64 superblocks will be used. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock); // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This // is done as a simple look up of the |block_parameters_cache_| matrix. @@ -54,6 +53,10 @@ class BlockParametersHolder { return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; } + BlockParameters* const* Address(int row4x4, int column4x4) const { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + int columns4x4() const { return columns4x4_; } // Returns the ParameterTree corresponding to superblock starting at (|row|, @@ -66,9 +69,9 @@ class BlockParametersHolder { BlockParameters* bp); private: - const int rows4x4_; - const int columns4x4_; - const bool use_128x128_superblock_; + int rows4x4_ = 0; + int columns4x4_ = 0; + bool use_128x128_superblock_ = false; Array2D<std::unique_ptr<ParameterTree>> trees_; // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by diff --git a/chromium/third_party/libgav1/src/src/utils/common.h b/chromium/third_party/libgav1/src/src/utils/common.h index 56f413a2849..d6e019933e2 100644 --- a/chromium/third_party/libgav1/src/src/utils/common.h +++ b/chromium/third_party/libgav1/src/src/utils/common.h @@ -400,19 +400,17 @@ constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; } // 7.9.3. (without the clamp for numerator and denominator). inline void GetMvProjection(const MotionVector& mv, int numerator, - int denominator, MotionVector* projection_mv) { - // Allow numerator and denominator to be 0 so that this function can be called - // unconditionally. When either numerator or denominator is 0, |projection_mv| - // will be 0, and this is what we want. + int division_multiplier, + MotionVector* projection_mv) { + // Allow numerator to be 0 so that this function can be called + // unconditionally. When numerator is 0, |projection_mv| will be 0, and this + // is what we want. 
assert(std::abs(numerator) <= kMaxFrameDistance); - assert(denominator >= 0); - assert(denominator <= kMaxFrameDistance); for (int i = 0; i < 2; ++i) { - projection_mv->mv[i] = Clip3( - RightShiftWithRoundingSigned( - mv.mv[i] * numerator * kProjectionMvDivisionLookup[denominator], - 14), - -kProjectionMvClamp, kProjectionMvClamp); + projection_mv->mv[i] = + Clip3(RightShiftWithRoundingSigned( + mv.mv[i] * numerator * division_multiplier, 14), + -kProjectionMvClamp, kProjectionMvClamp); } } diff --git a/chromium/third_party/libgav1/src/src/utils/constants.h b/chromium/third_party/libgav1/src/src/utils/constants.h index 868bfdc8c82..f070767ecb6 100644 --- a/chromium/third_party/libgav1/src/src/utils/constants.h +++ b/chromium/third_party/libgav1/src/src/utils/constants.h @@ -27,11 +27,14 @@ namespace libgav1 { // Returns the number of elements between begin (inclusive) and end (inclusive). constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; } -#if defined(ENABLE_DEBLOCK_BIT_MASK) -constexpr bool kDeblockFilterBitMask = true; +enum { +// Maximum number of threads that the library will ever create. +#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0 + kMaxThreads = LIBGAV1_MAX_THREADS #else -constexpr bool kDeblockFilterBitMask = false; -#endif // defined(ENABLE_DEBLOCK_BIT_MASK) + kMaxThreads = 128 +#endif +}; // anonymous enum enum { kInvalidMvValue = -32768, @@ -44,7 +47,6 @@ enum { kFrameLfCount = 4, kMaxLoopFilterValue = 63, kNum4x4In64x64 = 256, - kNumLoopFilterMasks = 4, kMaxAngleDelta = 3, kDirectionalIntraModes = 8, kMaxSuperBlockSizeLog2 = 7, @@ -97,24 +99,19 @@ enum { kMaxSuperBlockSizeInPixels = 128, kMaxScaledSuperBlockSizeInPixels = 128 * 2, kMaxSuperBlockSizeSquareInPixels = 128 * 128, - kNum4x4InLoopFilterMaskUnit = 16, + kNum4x4InLoopFilterUnit = 16, kProjectionMvClamp = (1 << 14) - 1, // == 16383 kProjectionMvMaxHorizontalOffset = 8, + kCdefUnitSize = 64, + kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kRestorationBorder, kRestorationUnitOffset = 8, - // 2 pixel padding for 5x5 box sum on each side. - kRestorationPadding = 4, // Loop restoration's processing unit size is fixed as 64x64. - kRestorationProcessingUnitSize = 64, - kRestorationProcessingUnitSizeWithBorders = - kRestorationProcessingUnitSize + 2 * kRestorationBorder, - // The max size of a box filter process output buffer. - kMaxBoxFilterProcessOutputPixels = kRestorationProcessingUnitSize * - kRestorationProcessingUnitSize, // == 4096 - // The max size of a box filter process intermediate buffer. - kBoxFilterProcessIntermediatePixels = - (kRestorationProcessingUnitSizeWithBorders + kRestorationPadding) * - (kRestorationProcessingUnitSizeWithBorders + - kRestorationPadding), // == 5476 + kRestorationUnitHeight = 64, + kRestorationUnitWidth = 256, + kRestorationUnitHeightWithBorders = + kRestorationUnitHeight + 2 * kRestorationBorder, + kRestorationUnitWidthWithBorders = + kRestorationUnitWidth + 2 * kRestorationBorder, kSuperResFilterBits = 6, kSuperResFilterShifts = 1 << kSuperResFilterBits, kSuperResFilterTaps = 8, @@ -148,8 +145,6 @@ enum { kMaxFrameDistance = 31, kReferenceFrameScalePrecision = 14, kNumWienerCoefficients = 3, - // Maximum number of threads that the library will ever use at any given time. - kMaxThreads = 32, kLoopFilterMaxModeDeltas = 2, kMaxCdefStrengths = 8, kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available. @@ -512,14 +507,6 @@ enum ObuType : int8_t { kObuPadding = 15, }; -// Enum to track the processing state of a superblock. 
-enum SuperBlockState : uint8_t { - kSuperBlockStateNone, // Not yet parsed or decoded. - kSuperBlockStateParsed, // Parsed but not yet decoded. - kSuperBlockStateScheduled, // Scheduled for decoding. - kSuperBlockStateDecoded // Parsed and decoded. -}; - //------------------------------------------------------------------------------ // ToString() // diff --git a/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake b/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake index 50bf941306f..8b6ec4bee32 100644 --- a/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake +++ b/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake @@ -44,6 +44,7 @@ list(APPEND libgav1_utils_sources "${libgav1_source}/utils/queue.h" "${libgav1_source}/utils/raw_bit_reader.cc" "${libgav1_source}/utils/raw_bit_reader.h" + "${libgav1_source}/utils/reference_info.h" "${libgav1_source}/utils/segmentation.cc" "${libgav1_source}/utils/segmentation.h" "${libgav1_source}/utils/segmentation_map.cc" diff --git a/chromium/third_party/libgav1/src/src/utils/reference_info.h b/chromium/third_party/libgav1/src/src/utils/reference_info.h new file mode 100644 index 00000000000..a6607912ab8 --- /dev/null +++ b/chromium/third_party/libgav1/src/src/utils/reference_info.h @@ -0,0 +1,92 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ +#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ + +#include <array> +#include <cstdint> + +#include "src/utils/array_2d.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// This struct collects some members related to reference frames in one place to +// make it easier to pass them as parameters to some dsp functions. +struct ReferenceInfo { + // Initialize |motion_field_reference_frame| so that + // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when + // the updates are the same as the initialized value. + // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify + // branch conditions in motion field projection. + // The following memory initialization of contiguous memory is very fast. It + // is not recommended to make the initialization multi-threaded, unless the + // memory which needs to be initialized in each thread is still contiguous. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) { + return motion_field_reference_frame.Reset(rows, columns, + /*zero_initialize=*/true) && + motion_field_mv.Reset( + rows, columns, +#if LIBGAV1_MSAN + // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only + // for qualified blocks. In MotionFieldProjectionKernel() dsp + // optimizations, it is read no matter it was set or not. + /*zero_initialize=*/true +#else + /*zero_initialize=*/false +#endif + ); + } + + // All members are used by inter frames only. + // For intra frames, they are not initialized. 
+ + std::array<uint8_t, kNumReferenceFrameTypes> order_hint; + + // An example when |relative_distance_from| does not equal + // -|relative_distance_to|: + // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64 + // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64 + // This is why we need both |relative_distance_from| and + // |relative_distance_to|. + // |relative_distance_from|: Relative distances from reference frames to this + // frame. + std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from; + // |relative_distance_to|: Relative distances to reference frames. + std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to; + + // Skip motion field projection of specific types of frames if their + // |relative_distance_to| is negative or too large. + std::array<bool, kNumReferenceFrameTypes> skip_references; + // Lookup table to get motion field projection division multiplier of specific + // types of frames. Derived from kProjectionMvDivisionLookup. + std::array<int16_t, kNumReferenceFrameTypes> projection_divisions; + + // The current frame's |motion_field_reference_frame| and |motion_field_mv_| + // are guaranteed to be allocated only when refresh_frame_flags is not 0. + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec. + Array2D<ReferenceFrameType> motion_field_reference_frame; + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. + Array2D<MotionVector> motion_field_mv; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ diff --git a/chromium/third_party/libgav1/src/src/utils/types.h b/chromium/third_party/libgav1/src/src/utils/types.h index 8a95bdb20f9..89b35ad7b21 100644 --- a/chromium/third_party/libgav1/src/src/utils/types.h +++ b/chromium/third_party/libgav1/src/src/utils/types.h @@ -283,8 +283,10 @@ struct Delta { }; struct Cdef { - uint8_t damping; + uint8_t damping; // damping value from the spec + (bitdepth - 8). uint8_t bits; + // All the strength values are the values from the spec and left shifted by + // (bitdepth - 8). uint8_t y_primary_strength[kMaxCdefStrengths]; uint8_t y_secondary_strength[kMaxCdefStrengths]; uint8_t uv_primary_strength[kMaxCdefStrengths]; |