author     Allan Sandfeld Jensen <allan.jensen@qt.io>  2020-10-12 14:27:29 +0200
committer  Allan Sandfeld Jensen <allan.jensen@qt.io>  2020-10-13 09:35:20 +0000
commit     c30a6232df03e1efbd9f3b226777b07e087a1122 (patch)
tree       e992f45784689f373bcc38d1b79a239ebe17ee23 /chromium/third_party/libgav1
parent     7b5b123ac58f58ffde0f4f6e488bcd09aa4decd3 (diff)
download   qtwebengine-chromium-85-based.tar.gz

BASELINE: Update Chromium to 85.0.4183.140 (85-based)

Change-Id: Iaa42f4680837c57725b1344f108c0196741f6057
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/libgav1')
-rw-r--r--  chromium/third_party/libgav1/BUILD.gn | 4
-rw-r--r--  chromium/third_party/libgav1/README.chromium | 4
-rw-r--r--  chromium/third_party/libgav1/libgav1_srcs.gni | 7
-rw-r--r--  chromium/third_party/libgav1/src/README.md | 3
-rw-r--r--  chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake | 8
-rw-r--r--  chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake | 11
-rw-r--r--  chromium/third_party/libgav1/src/examples/file_reader.cc | 8
-rw-r--r--  chromium/third_party/libgav1/src/examples/file_reader.h | 11
-rw-r--r--  chromium/third_party/libgav1/src/examples/file_reader_factory.cc | 2
-rw-r--r--  chromium/third_party/libgav1/src/examples/file_reader_factory.h | 6
-rw-r--r--  chromium/third_party/libgav1/src/examples/file_reader_interface.h | 6
-rw-r--r--  chromium/third_party/libgav1/src/examples/file_writer.cc | 26
-rw-r--r--  chromium/third_party/libgav1/src/examples/file_writer.h | 8
-rw-r--r--  chromium/third_party/libgav1/src/examples/logging.h | 1
-rw-r--r--  chromium/third_party/libgav1/src/src/buffer_pool.cc | 28
-rw-r--r--  chromium/third_party/libgav1/src/src/buffer_pool.h | 81
-rw-r--r--  chromium/third_party/libgav1/src/src/decoder_impl.cc | 686
-rw-r--r--  chromium/third_party/libgav1/src/src/decoder_impl.h | 67
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc | 295
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc | 41
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc | 1605
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc | 219
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc | 21
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc | 2
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h | 1
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/cdef.cc | 36
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/cdef.inc | 29
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/common.h | 14
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/constants.cc | 21
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/constants.h | 2
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/dsp.cc | 2
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/dsp.h | 65
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake | 7
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/loop_filter.cc | 8
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc | 1025
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/loop_restoration.h | 13
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc | 46
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h | 8
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc | 39
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h | 9
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc | 360
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h | 48
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc | 1900
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h | 24
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc | 2
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc | 1888
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc | 397
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h | 37
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc | 262
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h | 37
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h | 10
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc | 558
-rw-r--r--  chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h | 1
-rw-r--r--  chromium/third_party/libgav1/src/src/frame_scratch_buffer.h | 31
-rw-r--r--  chromium/third_party/libgav1/src/src/gav1/decoder.h | 19
-rw-r--r--  chromium/third_party/libgav1/src/src/gav1/decoder_settings.h | 28
-rw-r--r--  chromium/third_party/libgav1/src/src/libgav1_decoder.cmake | 4
-rw-r--r--  chromium/third_party/libgav1/src/src/loop_filter_mask.cc | 208
-rw-r--r--  chromium/third_party/libgav1/src/src/loop_filter_mask.h | 189
-rw-r--r--  chromium/third_party/libgav1/src/src/motion_vector.cc | 111
-rw-r--r--  chromium/third_party/libgav1/src/src/motion_vector.h | 4
-rw-r--r--  chromium/third_party/libgav1/src/src/obu_parser.cc | 67
-rw-r--r--  chromium/third_party/libgav1/src/src/post_filter.h | 306
-rw-r--r--  chromium/third_party/libgav1/src/src/post_filter/cdef.cc | 423
-rw-r--r--  chromium/third_party/libgav1/src/src/post_filter/deblock.cc | 660
-rw-r--r--  chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc | 85
-rw-r--r--  chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc | 184
-rw-r--r--  chromium/third_party/libgav1/src/src/post_filter/post_filter.cc | 57
-rw-r--r--  chromium/third_party/libgav1/src/src/post_filter/super_res.cc | 8
-rw-r--r--  chromium/third_party/libgav1/src/src/threading_strategy.cc | 103
-rw-r--r--  chromium/third_party/libgav1/src/src/threading_strategy.h | 64
-rw-r--r--  chromium/third_party/libgav1/src/src/tile.h | 76
-rw-r--r--  chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc | 11
-rw-r--r--  chromium/third_party/libgav1/src/src/tile/prediction.cc | 48
-rw-r--r--  chromium/third_party/libgav1/src/src/tile/tile.cc | 342
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/array_2d.h | 1
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc | 12
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h | 21
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/common.h | 20
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/constants.h | 45
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake | 1
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/reference_info.h | 92
-rw-r--r--  chromium/third_party/libgav1/src/src/utils/types.h | 4
83 files changed, 8272 insertions(+), 4951 deletions(-)
diff --git a/chromium/third_party/libgav1/BUILD.gn b/chromium/third_party/libgav1/BUILD.gn
index 9a31f423f6e..3a28871b8d2 100644
--- a/chromium/third_party/libgav1/BUILD.gn
+++ b/chromium/third_party/libgav1/BUILD.gn
@@ -16,6 +16,10 @@ config("public_libgav1_config") {
"LIBGAV1_THREADPOOL_USE_STD_MUTEX", # to avoid abseil dependency.
"LIBGAV1_ENABLE_LOGGING=0", # to avoid debug log of libgav1 in chromium
# debug build.
+
+ # Don't let libgav1 export any symbols. Otherwise the verify_order step on
+ # macOS can fail since these exports end up in the final Chromium binary.
+ "LIBGAV1_PUBLIC=",
]
}
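
Note: LIBGAV1_PUBLIC normally expands to a symbol-visibility attribute on libgav1's public API, so defining it to nothing from the build file keeps those declarations out of the export table. A minimal sketch of the mechanism, assuming the usual visibility-macro pattern (the exact default definition lives in libgav1's headers and is not part of this patch):

    // Typical default (assumed here for illustration): mark the symbol exported.
    #if !defined(LIBGAV1_PUBLIC)
    #if defined(_WIN32)
    #define LIBGAV1_PUBLIC __declspec(dllexport)
    #else
    #define LIBGAV1_PUBLIC __attribute__((visibility("default")))
    #endif
    #endif

    // With "LIBGAV1_PUBLIC=" injected by BUILD.gn the macro expands to nothing,
    // so a declaration like this no longer adds an exported symbol to the final
    // Chromium binary (the situation the verify_order comment above refers to).
    class LIBGAV1_PUBLIC Decoder { /* ... */ };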
diff --git a/chromium/third_party/libgav1/README.chromium b/chromium/third_party/libgav1/README.chromium
index fc62bb71907..27a8fe8222f 100644
--- a/chromium/third_party/libgav1/README.chromium
+++ b/chromium/third_party/libgav1/README.chromium
@@ -2,9 +2,9 @@ Name: libgav1
Short Name: libgav1
URL: https://chromium.googlesource.com/codecs/libgav1/
Version: 0
-Date: Thursday March 26 2020
+Date: Saturday May 23 2020
Branch: master
-Commit: 638ef84819f8b3cd614dcf63378fe4814aa4cb2a
+Commit: bf190c43e5c7cc81751867c917a81bc2920be079
License: Apache 2.0
License File: libgav1/LICENSE
Security Critical: yes
diff --git a/chromium/third_party/libgav1/libgav1_srcs.gni b/chromium/third_party/libgav1/libgav1_srcs.gni
index e460d030f1b..9dc54f97124 100644
--- a/chromium/third_party/libgav1/libgav1_srcs.gni
+++ b/chromium/third_party/libgav1/libgav1_srcs.gni
@@ -15,8 +15,6 @@ gav1_common_sources = [
"//third_party/libgav1/src/src/frame_scratch_buffer.h",
"//third_party/libgav1/src/src/internal_frame_buffer_list.cc",
"//third_party/libgav1/src/src/internal_frame_buffer_list.h",
- "//third_party/libgav1/src/src/loop_filter_mask.cc",
- "//third_party/libgav1/src/src/loop_filter_mask.h",
"//third_party/libgav1/src/src/loop_restoration_info.cc",
"//third_party/libgav1/src/src/loop_restoration_info.h",
"//third_party/libgav1/src/src/motion_vector.cc",
@@ -146,6 +144,10 @@ gav1_dsp_sources = [
"//third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.h",
"//third_party/libgav1/src/src/dsp/x86/mask_blend_sse4.cc",
"//third_party/libgav1/src/src/dsp/x86/mask_blend_sse4.h",
+ "//third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc",
+ "//third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h",
+ "//third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc",
+ "//third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h",
"//third_party/libgav1/src/src/dsp/x86/obmc_sse4.cc",
"//third_party/libgav1/src/src/dsp/x86/obmc_sse4.h",
"//third_party/libgav1/src/src/dsp/x86/super_res_sse4.cc",
@@ -215,6 +217,7 @@ gav1_utils_sources = [
"//third_party/libgav1/src/src/utils/queue.h",
"//third_party/libgav1/src/src/utils/raw_bit_reader.cc",
"//third_party/libgav1/src/src/utils/raw_bit_reader.h",
+ "//third_party/libgav1/src/src/utils/reference_info.h",
"//third_party/libgav1/src/src/utils/segmentation.cc",
"//third_party/libgav1/src/src/utils/segmentation.h",
"//third_party/libgav1/src/src/utils/segmentation_map.cc",
diff --git a/chromium/third_party/libgav1/src/README.md b/chromium/third_party/libgav1/src/README.md
index ead3fc3b8ee..a5799d1395b 100644
--- a/chromium/third_party/libgav1/src/README.md
+++ b/chromium/third_party/libgav1/src/README.md
@@ -56,6 +56,9 @@ Configuration options:
absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
dependency from the core library. Automatically defined in
`src/utils/threadpool.h` if unset.
+* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+ allowed to create. Has to be an integer > 0. Otherwise this is ignored.
+ The default value is 128.
For additional options see:
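
The new LIBGAV1_MAX_THREADS option caps how many threads the library may create. A short sketch of how such a compile-time cap is typically consumed; the function name and clamping details below are illustrative assumptions, not code from this patch:

    #include <algorithm>

    // Documented default when the option is not set on the compile line.
    #ifndef LIBGAV1_MAX_THREADS
    #define LIBGAV1_MAX_THREADS 128
    #endif

    // Clamp a caller-requested thread count to the configured ceiling.
    int ClampThreadCount(int requested_threads) {
      if (requested_threads <= 0) return 1;
      return std::min(requested_threads, LIBGAV1_MAX_THREADS);
    }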
diff --git a/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake b/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake
index cd5ff9e1230..31017a9de14 100644
--- a/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake
+++ b/chromium/third_party/libgav1/src/cmake/libgav1_build_definitions.cmake
@@ -63,6 +63,14 @@ macro(libgav1_set_build_definitions)
list(APPEND libgav1_clang_cxx_flags "-Wmissing-prototypes"
"-Wshorten-64-to-32")
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6")
+ # Quiet warnings in copy-list-initialization where {} elision has always
+ # been allowed.
+ list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces")
+ endif()
+ endif()
+
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7")
# Quiet warnings due to potential snprintf() truncation in threadpool.cc.
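
The warning silenced above concerns copy-list-initialization of aggregates where the inner braces are elided, which Clang releases before 6 flag even though the code is well-formed. An illustrative example of the pattern (not taken from this patch):

    #include <array>
    #include <cstdint>

    struct ExampleMv {  // stand-in aggregate with a nested array member
      int16_t mv[2];
    };

    // Both initializers rely on brace elision; Clang < 6 suggests the fully
    // braced forms {{1, 2}} and {{0, 0}} via -Wmissing-braces.
    std::array<int, 2> pair = {1, 2};
    ExampleMv zero_mv = {0, 0};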
diff --git a/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake b/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake
index 295b078756a..0b8df60f3df 100644
--- a/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake
+++ b/chromium/third_party/libgav1/src/cmake/libgav1_flags.cmake
@@ -212,14 +212,17 @@ endmacro()
macro(libgav1_set_cxx_flags)
unset(cxx_flag_lists)
- if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
- list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
- endif()
-
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
endif()
+ # Append clang flags after the base set to allow -Wno* overrides to take
+ # effect. Some of the base flags may enable a large set of warnings, e.g.,
+ # -Wall.
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+ endif()
+
if(MSVC)
list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
endif()
diff --git a/chromium/third_party/libgav1/src/examples/file_reader.cc b/chromium/third_party/libgav1/src/examples/file_reader.cc
index f174e2d67b6..b0967227ef8 100644
--- a/chromium/third_party/libgav1/src/examples/file_reader.cc
+++ b/chromium/third_party/libgav1/src/examples/file_reader.cc
@@ -26,7 +26,6 @@
#include <io.h>
#endif
-#include "absl/memory/memory.h"
#include "examples/file_reader_constants.h"
#include "examples/file_reader_factory.h"
#include "examples/file_reader_interface.h"
@@ -53,10 +52,9 @@ FileReader::~FileReader() {
}
std::unique_ptr<FileReaderInterface> FileReader::Open(
- absl::string_view file_name, const bool error_tolerant) {
+ const std::string& file_name, const bool error_tolerant) {
if (file_name.empty()) return nullptr;
- const std::string fopen_file_name = std::string(file_name);
FILE* raw_file_ptr;
bool owns_file = true;
@@ -64,14 +62,14 @@ std::unique_ptr<FileReaderInterface> FileReader::Open(
raw_file_ptr = SetBinaryMode(stdin);
owns_file = false; // stdin is owned by the Standard C Library.
} else {
- raw_file_ptr = fopen(fopen_file_name.c_str(), "rb");
+ raw_file_ptr = fopen(file_name.c_str(), "rb");
}
if (raw_file_ptr == nullptr) {
return nullptr;
}
- auto file = absl::WrapUnique(
+ std::unique_ptr<FileReader> file(
new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
if (file == nullptr) {
LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
diff --git a/chromium/third_party/libgav1/src/examples/file_reader.h b/chromium/third_party/libgav1/src/examples/file_reader.h
index ad5911e32fe..c342a20df1e 100644
--- a/chromium/third_party/libgav1/src/examples/file_reader.h
+++ b/chromium/third_party/libgav1/src/examples/file_reader.h
@@ -21,10 +21,9 @@
#include <cstdint>
#include <cstdio>
#include <memory>
+#include <string>
#include <vector>
-#include "absl/base/attributes.h"
-#include "absl/strings/string_view.h"
#include "examples/file_reader_interface.h"
namespace libgav1 {
@@ -42,7 +41,7 @@ class FileReader : public FileReaderInterface {
// ReadTemporalUnit() may return truncated data.
// Returns nullptr when the file does not exist, cannot be read, or is not an
// IVF file.
- static std::unique_ptr<FileReaderInterface> Open(absl::string_view file_name,
+ static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name,
bool error_tolerant = false);
FileReader() = delete;
@@ -62,10 +61,10 @@ class FileReader : public FileReaderInterface {
// The |timestamp| pointer is optional: callers not interested in timestamps
// can pass nullptr. When |timestamp| is not a nullptr, this function returns
// the presentation timestamp from the IVF frame header.
- ABSL_MUST_USE_RESULT bool ReadTemporalUnit(std::vector<uint8_t>* tu_data,
- int64_t* timestamp) override;
+ /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
- ABSL_MUST_USE_RESULT bool IsEndOfFile() const override {
+ /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
return feof(file_) != 0;
}
diff --git a/chromium/third_party/libgav1/src/examples/file_reader_factory.cc b/chromium/third_party/libgav1/src/examples/file_reader_factory.cc
index 860d916423d..d5260eba893 100644
--- a/chromium/third_party/libgav1/src/examples/file_reader_factory.cc
+++ b/chromium/third_party/libgav1/src/examples/file_reader_factory.cc
@@ -38,7 +38,7 @@ bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
}
std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
- absl::string_view file_name, const bool error_tolerant /*= false*/) {
+ const std::string& file_name, const bool error_tolerant /*= false*/) {
for (auto* open_function : *GetFileReaderOpenFunctions()) {
auto reader = open_function(file_name, error_tolerant);
if (reader == nullptr) continue;
diff --git a/chromium/third_party/libgav1/src/examples/file_reader_factory.h b/chromium/third_party/libgav1/src/examples/file_reader_factory.h
index ddf8744d19b..0f534845e75 100644
--- a/chromium/third_party/libgav1/src/examples/file_reader_factory.h
+++ b/chromium/third_party/libgav1/src/examples/file_reader_factory.h
@@ -18,8 +18,8 @@
#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
#include <memory>
+#include <string>
-#include "absl/strings/string_view.h"
#include "examples/file_reader_interface.h"
namespace libgav1 {
@@ -27,7 +27,7 @@ namespace libgav1 {
class FileReaderFactory {
public:
using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
- absl::string_view file_name, bool error_tolerant);
+ const std::string& file_name, bool error_tolerant);
FileReaderFactory() = delete;
FileReaderFactory(const FileReaderFactory&) = delete;
@@ -43,7 +43,7 @@ class FileReaderFactory {
// returned. If |error_tolerant| is true and the reader supports it, some
// format and read errors may be ignored and partial data returned.
static std::unique_ptr<FileReaderInterface> OpenReader(
- absl::string_view file_name, bool error_tolerant = false);
+ const std::string& file_name, bool error_tolerant = false);
};
} // namespace libgav1
diff --git a/chromium/third_party/libgav1/src/examples/file_reader_interface.h b/chromium/third_party/libgav1/src/examples/file_reader_interface.h
index d768017e2ba..d8f703091e2 100644
--- a/chromium/third_party/libgav1/src/examples/file_reader_interface.h
+++ b/chromium/third_party/libgav1/src/examples/file_reader_interface.h
@@ -21,8 +21,6 @@
#include <cstdint>
#include <vector>
-#include "absl/base/attributes.h"
-
namespace libgav1 {
class FileReaderInterface {
@@ -47,10 +45,10 @@ class FileReaderInterface {
// The |timestamp| pointer is optional: callers not interested in timestamps
// can pass nullptr. When |timestamp| is not a nullptr, this function returns
// the presentation timestamp of the temporal unit.
- ABSL_MUST_USE_RESULT virtual bool ReadTemporalUnit(
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
- ABSL_MUST_USE_RESULT virtual bool IsEndOfFile() const = 0;
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
// The values returned by these accessors are strictly informative. No
// validation is performed when they are read from file.
diff --git a/chromium/third_party/libgav1/src/examples/file_writer.cc b/chromium/third_party/libgav1/src/examples/file_writer.cc
index bf13d4a1199..54afe145cde 100644
--- a/chromium/third_party/libgav1/src/examples/file_writer.cc
+++ b/chromium/third_party/libgav1/src/examples/file_writer.cc
@@ -25,8 +25,6 @@
#include <io.h>
#endif
-#include "absl/memory/memory.h"
-#include "absl/strings/str_format.h"
#include "examples/logging.h"
namespace libgav1 {
@@ -72,9 +70,8 @@ std::string GetY4mColorSpaceString(
if (y4m_parameters.bitdepth > 8) {
const bool monochrome =
y4m_parameters.image_format == kImageFormatMonochrome400;
- color_space_string =
- absl::StrFormat("%s%s%d", color_space_string, monochrome ? "" : "p",
- y4m_parameters.bitdepth);
+ if (!monochrome) color_space_string += "p";
+ color_space_string += std::to_string(y4m_parameters.bitdepth);
}
return color_space_string;
@@ -85,7 +82,7 @@ std::string GetY4mColorSpaceString(
FileWriter::~FileWriter() { fclose(file_); }
std::unique_ptr<FileWriter> FileWriter::Open(
- absl::string_view file_name, FileType file_type,
+ const std::string& file_name, FileType file_type,
const Y4mParameters* const y4m_parameters) {
if (file_name.empty() ||
(file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
@@ -94,13 +91,12 @@ std::unique_ptr<FileWriter> FileWriter::Open(
return nullptr;
}
- const std::string fopen_file_name = std::string(file_name);
FILE* raw_file_ptr;
if (file_name == "-") {
raw_file_ptr = SetBinaryMode(stdout);
} else {
- raw_file_ptr = fopen(fopen_file_name.c_str(), "wb");
+ raw_file_ptr = fopen(file_name.c_str(), "wb");
}
if (raw_file_ptr == nullptr) {
@@ -108,7 +104,7 @@ std::unique_ptr<FileWriter> FileWriter::Open(
return nullptr;
}
- auto file = absl::WrapUnique(new (std::nothrow) FileWriter(raw_file_ptr));
+ std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
if (file == nullptr) {
LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
fclose(raw_file_ptr);
@@ -173,11 +169,13 @@ bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
//
// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
- std::string y4m_header = absl::StrFormat(
- "YUV4MPEG2 W%zu H%zu F%zu:%zu Ip C%s\n", y4m_parameters.width,
- y4m_parameters.height, y4m_parameters.frame_rate_numerator,
- y4m_parameters.frame_rate_denominator,
- GetY4mColorSpaceString(y4m_parameters));
+ std::string y4m_header = "YUV4MPEG2";
+ y4m_header += " W" + std::to_string(y4m_parameters.width);
+ y4m_header += " H" + std::to_string(y4m_parameters.height);
+ y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+ ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+ y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+ y4m_header += "\n";
return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
y4m_header.length();
}
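
For reference, the string concatenation above produces the same header line the removed absl::StrFormat call did. A minimal standalone check with made-up parameter values (names mirror the Y4mParameters fields):

    #include <cstddef>
    #include <iostream>
    #include <string>

    int main() {
      // Hypothetical stream parameters: 1280x720, 30000/1001 fps, 10-bit 4:2:0.
      const std::size_t width = 1280, height = 720;
      const std::size_t frame_rate_numerator = 30000, frame_rate_denominator = 1001;
      const std::string color_space_string = "420p10";

      std::string y4m_header = "YUV4MPEG2";
      y4m_header += " W" + std::to_string(width);
      y4m_header += " H" + std::to_string(height);
      y4m_header += " F" + std::to_string(frame_rate_numerator) + ":" +
                    std::to_string(frame_rate_denominator);
      y4m_header += " Ip C" + color_space_string;
      y4m_header += "\n";
      // Prints: YUV4MPEG2 W1280 H720 F30000:1001 Ip C420p10
      std::cout << y4m_header;
      return 0;
    }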
diff --git a/chromium/third_party/libgav1/src/examples/file_writer.h b/chromium/third_party/libgav1/src/examples/file_writer.h
index a7b1937dd37..00f6cc38097 100644
--- a/chromium/third_party/libgav1/src/examples/file_writer.h
+++ b/chromium/third_party/libgav1/src/examples/file_writer.h
@@ -21,9 +21,8 @@
#include <cstdint>
#include <cstdio>
#include <memory>
+#include <string>
-#include "absl/base/attributes.h"
-#include "absl/strings/string_view.h"
#include "gav1/decoder_buffer.h"
namespace libgav1 {
@@ -70,7 +69,7 @@ class FileWriter {
// Returns a FileWriter instance after the file is opened successfully for
// kFileTypeRaw files, and after the Y4M file header bytes are written for
// kFileTypeY4m files. Returns nullptr upon failure.
- static std::unique_ptr<FileWriter> Open(absl::string_view file_name,
+ static std::unique_ptr<FileWriter> Open(const std::string& file_name,
FileType type,
const Y4mParameters* y4m_parameters);
@@ -86,7 +85,8 @@ class FileWriter {
// Writes the frame data in |frame_buffer| to |file_|. Returns true after
// successful write of |frame_buffer| data.
- ABSL_MUST_USE_RESULT bool WriteFrame(const DecoderBuffer& frame_buffer);
+ /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+ const DecoderBuffer& frame_buffer);
private:
explicit FileWriter(FILE* file) : file_(file) {}
diff --git a/chromium/third_party/libgav1/src/examples/logging.h b/chromium/third_party/libgav1/src/examples/logging.h
index ba784ef5c15..536ed1dbaf2 100644
--- a/chromium/third_party/libgav1/src/examples/logging.h
+++ b/chromium/third_party/libgav1/src/examples/logging.h
@@ -18,6 +18,7 @@
#define LIBGAV1_EXAMPLES_LOGGING_H_
#include <cstddef>
+#include <cstdio>
namespace libgav1 {
namespace examples {
diff --git a/chromium/third_party/libgav1/src/src/buffer_pool.cc b/chromium/third_party/libgav1/src/src/buffer_pool.cc
index 282da8c948b..c1a5606cd11 100644
--- a/chromium/third_party/libgav1/src/src/buffer_pool.cc
+++ b/chromium/third_party/libgav1/src/src/buffer_pool.cc
@@ -69,27 +69,13 @@ bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
render_height_ = frame_header.render_height;
rows4x4_ = frame_header.rows4x4;
columns4x4_ = frame_header.columns4x4;
- const int rows4x4_half = DivideBy2(rows4x4_);
- const int columns4x4_half = DivideBy2(columns4x4_);
- if (!motion_field_reference_frame_.Reset(rows4x4_half, columns4x4_half,
- /*zero_initialize=*/false) ||
- !motion_field_mv_.Reset(rows4x4_half, columns4x4_half,
- /*zero_initialize=*/false)) {
- return false;
- }
- if (frame_header.refresh_frame_flags != 0) {
- // Initialize so that Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip
- // some updates when the updates are the same as the initialized value.
- // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify
- // branch conditions in motion field projection.
- // The following memory initialization of contiguous memory is very fast. It
- // is not recommended to make the initialization multi-threaded, unless the
- // memory which needs to be initialized in each thread is still contiguous.
- static_assert(sizeof(motion_field_reference_frame_[0][0]) == sizeof(int8_t),
- "");
- memset(motion_field_reference_frame_.data(), kReferenceFrameIntra,
- sizeof(motion_field_reference_frame_[0][0]) *
- motion_field_reference_frame_.size());
+ if (frame_header.refresh_frame_flags != 0 &&
+ !IsIntraFrame(frame_header.frame_type)) {
+ const int rows4x4_half = DivideBy2(rows4x4_);
+ const int columns4x4_half = DivideBy2(columns4x4_);
+ if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+ return false;
+ }
}
return segmentation_map_.Allocate(rows4x4_, columns4x4_);
}
diff --git a/chromium/third_party/libgav1/src/src/buffer_pool.h b/chromium/third_party/libgav1/src/src/buffer_pool.h
index 07adc838f12..13008c10cd2 100644
--- a/chromium/third_party/libgav1/src/src/buffer_pool.h
+++ b/chromium/third_party/libgav1/src/src/buffer_pool.h
@@ -19,6 +19,7 @@
#include <array>
#include <cassert>
+#include <climits>
#include <condition_variable> // NOLINT (unapproved c++11 header)
#include <cstdint>
#include <cstring>
@@ -29,9 +30,9 @@
#include "src/gav1/frame_buffer.h"
#include "src/internal_frame_buffer_list.h"
#include "src/symbol_decoder_context.h"
-#include "src/utils/array_2d.h"
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
#include "src/utils/segmentation.h"
#include "src/utils/segmentation_map.h"
#include "src/utils/types.h"
@@ -108,21 +109,11 @@ class RefCountedBuffer {
bool showable_frame() const { return showable_frame_; }
void set_showable_frame(bool value) { showable_frame_ = value; }
- // This array has kNumReferenceFrameTypes elements.
- const uint8_t* order_hint_array() const { return order_hint_.data(); }
- uint8_t order_hint(ReferenceFrameType reference_frame) const {
- return order_hint_[reference_frame];
- }
- void set_order_hint(ReferenceFrameType reference_frame, uint8_t order_hint) {
- order_hint_[reference_frame] = order_hint;
- }
- void ClearOrderHints() { order_hint_.fill(0); }
-
// Sets upscaled_width_, frame_width_, frame_height_, render_width_,
// render_height_, rows4x4_ and columns4x4_ from the corresponding fields
- // in frame_header. Allocates motion_field_reference_frame_,
- // motion_field_mv_, and segmentation_map_. Returns true on success, false
- // on failure.
+ // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+ // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+ // success, false on failure.
bool SetFrameDimensions(const ObuFrameHeader& frame_header);
int32_t upscaled_width() const { return upscaled_width_; }
@@ -135,27 +126,6 @@ class RefCountedBuffer {
int32_t rows4x4() const { return rows4x4_; }
int32_t columns4x4() const { return columns4x4_; }
- // Entry at |row|, |column| corresponds to
- // MfRefFrames[row * 2 + 1][column * 2 + 1] in the spec.
- ReferenceFrameType* motion_field_reference_frame(int row, int column) {
- return &motion_field_reference_frame_[row][column];
- }
-
- const ReferenceFrameType* motion_field_reference_frame(int row,
- int column) const {
- return &motion_field_reference_frame_[row][column];
- }
-
- // Entry at |row|, |column| corresponds to
- // MfMvs[row * 2 + 1][column * 2 + 1] in the spec.
- MotionVector* motion_field_mv(int row, int column) {
- return &motion_field_mv_[row][column];
- }
-
- const MotionVector* motion_field_mv(int row, int column) const {
- return &motion_field_mv_[row][column];
- }
-
SegmentationMap* segmentation_map() { return &segmentation_map_; }
const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
@@ -205,6 +175,9 @@ class RefCountedBuffer {
film_grain_params_ = params;
}
+ const ReferenceInfo* reference_info() const { return &reference_info_; }
+ ReferenceInfo* reference_info() { return &reference_info_; }
+
// This will wake up the WaitUntil*() functions and make them return false.
void Abort() {
{
@@ -217,8 +190,10 @@ class RefCountedBuffer {
}
void SetFrameState(FrameState frame_state) {
- std::lock_guard<std::mutex> lock(mutex_);
- frame_state_ = frame_state;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ frame_state_ = frame_state;
+ }
if (frame_state == kFrameStateParsed) {
parsed_condvar_.notify_all();
} else if (frame_state == kFrameStateDecoded) {
@@ -230,9 +205,11 @@ class RefCountedBuffer {
// Sets the progress of this frame to |progress_row| and notifies any threads
// that may be waiting on rows <= |progress_row|.
void SetProgress(int progress_row) {
- std::lock_guard<std::mutex> lock(mutex_);
- if (progress_row_ >= progress_row) return;
- progress_row_ = progress_row;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (progress_row_ >= progress_row) return;
+ progress_row_ = progress_row;
+ }
progress_row_condvar_.notify_all();
}
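
Both hunks above narrow the critical section so the condition variable is notified only after the mutex has been released, which keeps woken waiters from immediately blocking on the lock. A self-contained sketch of the pattern, with stand-in members mirroring the ones in RefCountedBuffer:

    #include <condition_variable>
    #include <mutex>

    struct ProgressTracker {
      std::mutex mutex;
      std::condition_variable progress_condvar;
      int progress_row = -1;

      void SetProgress(int row) {
        {
          std::lock_guard<std::mutex> lock(mutex);
          if (progress_row >= row) return;
          progress_row = row;
        }
        // Notify with the mutex already released.
        progress_condvar.notify_all();
      }
    };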
@@ -257,8 +234,14 @@ class RefCountedBuffer {
}
// Waits until the |progress_row| has been decoded (as indicated either by
- // |progress_row_| or |frame_state_|).
- bool WaitUntil(int progress_row) {
+ // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+ // nullptr and will be populated with the value of |progress_row_| after the
+ // wait.
+ //
+ // Typical usage of |progress_row_cache| is as follows:
+ // * Initialize |*progress_row_cache| to INT_MIN.
+ // * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
+ bool WaitUntil(int progress_row, int* progress_row_cache) {
// If |progress_row| is negative, it means that the wait is on the top
// border to be available. The top border will be available when row 0 has
// been decoded. So we can simply wait on row 0 instead.
@@ -268,6 +251,11 @@ class RefCountedBuffer {
!abort_) {
progress_row_condvar_.wait(lock);
}
+ // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+ // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+ // case.
+ *progress_row_cache =
+ (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
return !abort_;
}
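
A caller-side sketch of the |progress_row_cache| protocol described in the comment above, using a stand-in type since the real call sites (in the tile decoder) are not part of this hunk:

    #include <climits>

    // Stand-in for RefCountedBuffer; the real WaitUntil() blocks on a condition
    // variable and reports the decoded progress through the out parameter.
    struct ReferenceFrameStandIn {
      bool WaitUntil(int /*progress_row*/, int* progress_row_cache) {
        *progress_row_cache = INT_MAX;  // pretend the frame is fully decoded
        return true;
      }
    };

    // Returns true once |progress_row| is known to be decoded. The cache starts
    // at INT_MIN, so the first call always takes the (mutex + condvar) path;
    // later calls with progress_row <= *progress_row_cache skip it entirely.
    bool RowIsReady(ReferenceFrameStandIn* frame, int progress_row,
                    int* progress_row_cache) {
      if (*progress_row_cache < progress_row) {
        return frame->WaitUntil(progress_row, progress_row_cache);
      }
      return true;
    }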
@@ -311,8 +299,6 @@ class RefCountedBuffer {
ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
bool showable_frame_ = false;
- std::array<uint8_t, kNumReferenceFrameTypes> order_hint_ = {};
-
int32_t upscaled_width_ = 0;
int32_t frame_width_ = 0;
int32_t frame_height_ = 0;
@@ -321,12 +307,6 @@ class RefCountedBuffer {
int32_t columns4x4_ = 0;
int32_t rows4x4_ = 0;
- // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
- // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
- Array2D<ReferenceFrameType> motion_field_reference_frame_;
- // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
- // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
- Array2D<MotionVector> motion_field_mv_;
// segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
SegmentationMap segmentation_map_;
@@ -344,6 +324,7 @@ class RefCountedBuffer {
// on feature_enabled only, we also save their values as an optimization.
Segmentation segmentation_ = {};
FilmGrainParams film_grain_params_ = {};
+ ReferenceInfo reference_info_;
};
// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
diff --git a/chromium/third_party/libgav1/src/src/decoder_impl.cc b/chromium/third_party/libgav1/src/src/decoder_impl.cc
index 841e4efed4b..508bbde4822 100644
--- a/chromium/third_party/libgav1/src/src/decoder_impl.cc
+++ b/chromium/third_party/libgav1/src/src/decoder_impl.cc
@@ -27,7 +27,6 @@
#include "src/film_grain.h"
#include "src/frame_buffer_utils.h"
#include "src/frame_scratch_buffer.h"
-#include "src/loop_filter_mask.h"
#include "src/loop_restoration_info.h"
#include "src/obu_parser.h"
#include "src/post_filter.h"
@@ -36,6 +35,7 @@
#include "src/threading_strategy.h"
#include "src/utils/blocking_counter.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
#include "src/utils/logging.h"
#include "src/utils/parameter_tree.h"
#include "src/utils/raw_bit_reader.h"
@@ -61,6 +61,41 @@ int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
return border;
}
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+ int count) {
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ frame_scratch_buffer->tile_decoding_failed = true;
+ }
+ std::condition_variable* const condvars =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ for (int i = 0; i < count; ++i) {
+ condvars[i].notify_one();
+ }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
+ public:
+ FrameScratchBufferReleaser(
+ FrameScratchBufferPool* frame_scratch_buffer_pool,
+ std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+ : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+ frame_scratch_buffer_(frame_scratch_buffer) {}
+ ~FrameScratchBufferReleaser() {
+ frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+ }
+
+ private:
+ FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+ std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
+};
+
} // namespace
// static
@@ -107,22 +142,40 @@ DecoderImpl::~DecoderImpl() {
}
StatusCode DecoderImpl::Init() {
+ if (!GenerateWedgeMask(&wedge_masks_)) {
+ LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+ const uint8_t* data, size_t size) {
if (settings_.frame_parallel) {
-#if defined(ENABLE_FRAME_PARALLEL)
- if (settings_.threads > 1) {
- if (!InitializeThreadPoolsForFrameParallel(settings_.threads,
- &frame_thread_pool_)) {
- return kStatusOutOfMemory;
- }
- // TODO(b/142583029): Frame parallel decoding with in-frame
- // multi-threading is not yet implemented. Until then, we force
- // settings_.threads to 1 when frame parallel decoding is enabled.
- settings_.threads = 1;
+ DecoderState state;
+ std::unique_ptr<ObuParser> obu(
+ new (std::nothrow) ObuParser(data, size, &buffer_pool_, &state));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ RefCountedBufferPtr current_frame;
+ const StatusCode status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ current_frame = nullptr;
+ // We assume that the first frame that was parsed will contain the frame
+ // header. This assumption is usually true in practice. So we will simply
+ // not use frame parallel mode if this is not the case.
+ if (settings_.threads > 1 &&
+ !InitializeThreadPoolsForFrameParallel(
+ settings_.threads, obu->frame_header().tile_info.tile_count,
+ obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+ &frame_scratch_buffer_pool_)) {
+ return kStatusOutOfMemory;
}
-#else
- LIBGAV1_DLOG(
- ERROR, "Frame parallel decoding is not implemented, ignoring setting.");
-#endif // defined(ENABLE_FRAME_PARALLEL)
}
const int max_allowed_frames = GetMaxAllowedFrames();
assert(max_allowed_frames > 0);
@@ -130,10 +183,6 @@ StatusCode DecoderImpl::Init() {
LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
return kStatusOutOfMemory;
}
- if (!GenerateWedgeMask(&wedge_masks_)) {
- LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
- return kStatusOutOfMemory;
- }
return kStatusOk;
}
@@ -141,7 +190,19 @@ StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
int64_t user_private_data,
void* buffer_private_data) {
if (data == nullptr || size == 0) return kStatusInvalidArgument;
- if (abort_) return kStatusUnknownError;
+ if (HasFailure()) return kStatusUnknownError;
+ if (!seen_first_frame_) {
+ seen_first_frame_ = true;
+ const StatusCode status =
+ InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+ if (status != kStatusOk) {
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(settings_.callback_private_data,
+ buffer_private_data);
+ }
+ return SignalFailure(status);
+ }
+ }
if (temporal_units_.Full()) {
return kStatusTryAgain;
}
@@ -153,11 +214,13 @@ StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
StatusCode DecoderImpl::SignalFailure(StatusCode status) {
if (status == kStatusOk || status == kStatusTryAgain) return status;
- abort_ = true;
- failure_status_ = status;
// Make sure all waiting threads exit.
buffer_pool_.Abort();
frame_thread_pool_ = nullptr;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ failure_status_ = status;
+ }
while (!temporal_units_.Empty()) {
if (settings_.release_input_buffer != nullptr) {
settings_.release_input_buffer(
@@ -197,17 +260,22 @@ StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
temporal_units_.Pop();
return status;
}
- if (settings_.blocking_dequeue) {
+ {
std::unique_lock<std::mutex> lock(mutex_);
- while (!temporal_unit.decoded && !abort_) {
- decoded_condvar_.wait(lock);
+ if (settings_.blocking_dequeue) {
+ while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ decoded_condvar_.wait(lock);
+ }
+ } else {
+ if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ return kStatusTryAgain;
+ }
+ }
+ if (failure_status_ != kStatusOk) {
+ const StatusCode failure_status = failure_status_;
+ lock.unlock();
+ return SignalFailure(failure_status);
}
- } else {
- std::lock_guard<std::mutex> lock(mutex_);
- if (!temporal_unit.decoded && !abort_) return kStatusTryAgain;
- }
- if (abort_) {
- return SignalFailure(failure_status_);
}
if (settings_.release_input_buffer != nullptr) {
settings_.release_input_buffer(settings_.callback_private_data,
@@ -290,33 +358,32 @@ StatusCode DecoderImpl::ParseAndSchedule() {
std::lock_guard<std::mutex> lock(mutex_);
temporal_unit.has_displayable_frame = false;
temporal_unit.decoded = true;
- decoded_condvar_.notify_one();
return kStatusOk;
}
for (auto& frame : temporal_unit.frames) {
EncodedFrame* const encoded_frame = &frame;
frame_thread_pool_->Schedule([this, encoded_frame]() {
- if (abort_) return;
+ if (HasFailure()) return;
const StatusCode status = DecodeFrame(encoded_frame);
- if (abort_) return;
encoded_frame->state = {};
encoded_frame->frame = nullptr;
TemporalUnit& temporal_unit = encoded_frame->temporal_unit;
std::lock_guard<std::mutex> lock(mutex_);
+ if (failure_status_ != kStatusOk) return;
// temporal_unit's status defaults to kStatusOk. So we need to set it only
- // on error. If |abort_| is true at this point, it means that there has
- // already been a failure. So we don't care about this subsequent failure.
- // We will simply return the error code of the first failure.
+ // on error. If |failure_status_| is not kStatusOk at this point, it means
+ // that there has already been a failure. So we don't care about this
+ // subsequent failure. We will simply return the error code of the first
+ // failure.
if (status != kStatusOk) {
temporal_unit.status = status;
- if (!abort_) {
- abort_ = true;
+ if (failure_status_ == kStatusOk) {
failure_status_ = status;
}
}
temporal_unit.decoded =
++temporal_unit.decoded_count == temporal_unit.frames.size();
- if (temporal_unit.decoded || abort_) {
+ if (temporal_unit.decoded || failure_status_ != kStatusOk) {
decoded_condvar_.notify_one();
}
});
@@ -330,6 +397,17 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
const Vector<ObuTileGroup>& tile_groups = encoded_frame->tile_groups;
RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope (i.e.) on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
StatusCode status;
if (!frame_header.show_existing_frame) {
if (tile_groups.empty()) {
@@ -339,16 +417,9 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
// not have a reason to handle those cases, so we simply continue.
return kStatusOk;
}
- std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
- frame_scratch_buffer_pool_.Get();
- if (frame_scratch_buffer == nullptr) {
- LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
- return kStatusOutOfMemory;
- }
status = DecodeTiles(sequence_header, frame_header, tile_groups,
encoded_frame->state, frame_scratch_buffer.get(),
current_frame.get());
- frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer));
if (status != kStatusOk) {
return status;
}
@@ -362,8 +433,9 @@ StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
return kStatusOk;
}
RefCountedBufferPtr film_grain_frame;
- status = ApplyFilmGrain(sequence_header, frame_header, current_frame,
- &film_grain_frame, /*thread_pool=*/nullptr);
+ status = ApplyFilmGrain(
+ sequence_header, frame_header, current_frame, &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.thread_pool());
if (status != kStatusOk) {
return status;
}
@@ -402,6 +474,17 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
RefCountedBufferPtr current_frame;
RefCountedBufferPtr displayable_frame;
StatusCode status;
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope (i.e.) on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
while (obu->HasData()) {
status = obu->ParseOneFrame(&current_frame);
if (status != kStatusOk) {
@@ -433,16 +516,9 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
// not have a reason to handle those cases, so we simply continue.
continue;
}
- std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
- frame_scratch_buffer_pool_.Get();
- if (frame_scratch_buffer == nullptr) {
- LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
- return kStatusOutOfMemory;
- }
status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
obu->tile_groups(), state_,
frame_scratch_buffer.get(), current_frame.get());
- frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer));
if (status != kStatusOk) {
return status;
}
@@ -463,17 +539,10 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
}
displayable_frame = std::move(current_frame);
RefCountedBufferPtr film_grain_frame;
- std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
- frame_scratch_buffer_pool_.Get();
- if (frame_scratch_buffer == nullptr) {
- LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
- return kStatusOutOfMemory;
- }
status = ApplyFilmGrain(
obu->sequence_header(), obu->frame_header(), displayable_frame,
&film_grain_frame,
frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
- frame_scratch_buffer_pool_.Release(std::move(frame_scratch_buffer));
if (status != kStatusOk) return status;
displayable_frame = std::move(film_grain_frame);
}
@@ -572,25 +641,6 @@ StatusCode DecoderImpl::DecodeTiles(
RefCountedBuffer* const current_frame) {
frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
sequence_header.color_config.bitdepth);
- if (IsFrameParallel()) {
- // We can parse the current frame if all the reference frames have been
- // parsed.
- for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
- if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) {
- continue;
- }
- if (!state.reference_frame[i]->WaitUntilParsed()) {
- return kStatusUnknownError;
- }
- }
- }
- if (PostFilter::DoDeblock(frame_header, settings_.post_filter_mask)) {
- if (kDeblockFilterBitMask && !frame_scratch_buffer->loop_filter_mask.Reset(
- frame_header.width, frame_header.height)) {
- LIBGAV1_DLOG(ERROR, "Failed to allocate memory for loop filter masks.");
- return kStatusOutOfMemory;
- }
- }
if (!frame_scratch_buffer->loop_restoration_info.Reset(
&frame_header.loop_restoration, frame_header.upscaled_width,
frame_header.height, sequence_header.color_config.subsampling_x,
@@ -671,11 +721,10 @@ StatusCode DecoderImpl::DecodeTiles(
// The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
// that the block parameters cache can be filled in for the last row/column
// without having to check for boundary conditions.
- BlockParametersHolder block_parameters_holder(
- frame_header.rows4x4 + kMaxBlockHeight4x4,
- frame_header.columns4x4 + kMaxBlockWidth4x4,
- sequence_header.use_128x128_superblock);
- if (!block_parameters_holder.Init()) {
+ if (!frame_scratch_buffer->block_parameters_holder.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
+ sequence_header.use_128x128_superblock)) {
return kStatusOutOfMemory;
}
const dsp::Dsp* const dsp =
@@ -685,24 +734,6 @@ StatusCode DecoderImpl::DecodeTiles(
sequence_header.color_config.bitdepth);
return kStatusInternalError;
}
- // If prev_segment_ids is a null pointer, it is treated as if it pointed to
- // a segmentation map containing all 0s.
- const SegmentationMap* prev_segment_ids = nullptr;
- if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
- frame_scratch_buffer->symbol_decoder_context.Initialize(
- frame_header.quantizer.base_index);
- } else {
- const int index =
- frame_header
- .reference_frame_index[frame_header.primary_reference_frame];
- const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
- frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
- if (frame_header.segmentation.enabled &&
- prev_frame->columns4x4() == frame_header.columns4x4 &&
- prev_frame->rows4x4() == frame_header.rows4x4) {
- prev_segment_ids = prev_frame->segmentation_map();
- }
- }
const uint8_t tile_size_bytes = frame_header.tile_info.tile_size_bytes;
const int tile_count = tile_groups.back().end + 1;
@@ -714,26 +745,12 @@ StatusCode DecoderImpl::DecodeTiles(
}
ThreadingStrategy& threading_strategy =
frame_scratch_buffer->threading_strategy;
- if (!threading_strategy.Reset(frame_header, settings_.threads)) {
+ if (!IsFrameParallel() &&
+ !threading_strategy.Reset(frame_header, settings_.threads)) {
return kStatusOutOfMemory;
}
if (threading_strategy.row_thread_pool(0) != nullptr || IsFrameParallel()) {
- const int block_width4x4_minus_one =
- sequence_header.use_128x128_superblock ? 31 : 15;
- const int block_width4x4_log2 =
- sequence_header.use_128x128_superblock ? 5 : 4;
- const int superblock_rows =
- (frame_header.rows4x4 + block_width4x4_minus_one) >>
- block_width4x4_log2;
- const int superblock_columns =
- (frame_header.columns4x4 + block_width4x4_minus_one) >>
- block_width4x4_log2;
- if (!frame_scratch_buffer->superblock_state.Reset(superblock_rows,
- superblock_columns)) {
- LIBGAV1_DLOG(ERROR, "Failed to allocate super_block_state.\n");
- return kStatusOutOfMemory;
- }
if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
frame_scratch_buffer->residual_buffer_pool.reset(
new (std::nothrow) ResidualBufferPool(
@@ -818,25 +835,80 @@ StatusCode DecoderImpl::DecodeTiles(
}
}
- PostFilter post_filter(
- frame_header, sequence_header, &frame_scratch_buffer->loop_filter_mask,
- frame_scratch_buffer->cdef_index,
- frame_scratch_buffer->inter_transform_sizes,
- &frame_scratch_buffer->loop_restoration_info, &block_parameters_holder,
- current_frame->buffer(), &frame_scratch_buffer->deblock_buffer, dsp,
- threading_strategy.post_filter_thread_pool(),
- frame_scratch_buffer->threaded_window_buffer.get(),
- frame_scratch_buffer->superres_line_buffer.get(),
- settings_.post_filter_mask);
+ PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+ current_frame->buffer(), dsp,
+ settings_.post_filter_mask);
+
+ if (IsFrameParallel()) {
+ // We can parse the current frame if all the reference frames have been
+ // parsed.
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) {
+ continue;
+ }
+ if (!state.reference_frame[i]->WaitUntilParsed()) {
+ return kStatusUnknownError;
+ }
+ }
+ }
+
+ // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+ // a segmentation map containing all 0s.
+ const SegmentationMap* prev_segment_ids = nullptr;
+ if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+ frame_scratch_buffer->symbol_decoder_context.Initialize(
+ frame_header.quantizer.base_index);
+ } else {
+ const int index =
+ frame_header
+ .reference_frame_index[frame_header.primary_reference_frame];
+ const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+ frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+ if (frame_header.segmentation.enabled &&
+ prev_frame->columns4x4() == frame_header.columns4x4 &&
+ prev_frame->rows4x4() == frame_header.rows4x4) {
+ prev_segment_ids = prev_frame->segmentation_map();
+ }
+ }
+
// The Tile class must make use of a separate buffer to store the unfiltered
// pixels for the intra prediction of the next superblock row. This is done
// only when one of the following conditions are true:
- // * frame_parallel is true.
+ // * IsFrameParallel() is true.
// * settings_.threads == 1.
// In the non-frame-parallel multi-threaded case, we do not run the post
// filters in the decode loop. So this buffer need not be used.
const bool use_intra_prediction_buffer =
IsFrameParallel() || settings_.threads == 1;
+ if (use_intra_prediction_buffer) {
+ if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+ frame_header.tile_info.tile_rows)) {
+ LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+ return kStatusOutOfMemory;
+ }
+ IntraPredictionBuffer* const intra_prediction_buffers =
+ frame_scratch_buffer->intra_prediction_buffers.get();
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling =
+ (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+ const size_t intra_prediction_buffer_size =
+ ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+ (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t)));
+ for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+ ++tile_row) {
+ if (!intra_prediction_buffers[tile_row][plane].Resize(
+ intra_prediction_buffer_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate intra prediction buffer for tile "
+ "row %d plane %d.\n",
+ tile_row, plane);
+ return kStatusOutOfMemory;
+ }
+ }
+ }
+ }
+
SymbolDecoderContext saved_symbol_decoder_context;
int tile_index = 0;
BlockingCounterWithStatus pending_tiles(tile_count);
@@ -870,7 +942,7 @@ StatusCode DecoderImpl::DecodeTiles(
tile_number, tile_group.data + byte_offset, tile_size,
sequence_header, frame_header, current_frame, state,
frame_scratch_buffer, wedge_masks_, &saved_symbol_decoder_context,
- prev_segment_ids, &post_filter, &block_parameters_holder, dsp,
+ prev_segment_ids, &post_filter, dsp,
threading_strategy.row_thread_pool(tile_index++), &pending_tiles,
IsFrameParallel(), use_intra_prediction_buffer);
if (tile == nullptr) {
@@ -885,7 +957,12 @@ StatusCode DecoderImpl::DecodeTiles(
}
assert(tiles.size() == static_cast<size_t>(tile_count));
if (IsFrameParallel()) {
- return DecodeTilesFrameParallel(
+ if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+ return DecodeTilesFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ return DecodeTilesThreadedFrameParallel(
sequence_header, frame_header, tiles, saved_symbol_decoder_context,
prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
}
@@ -894,10 +971,8 @@ StatusCode DecoderImpl::DecodeTiles(
status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
frame_scratch_buffer, &post_filter);
} else {
- status = DecodeTilesThreadedNonFrameParallel(
- sequence_header, frame_header, tiles, tile_groups,
- block_parameters_holder, frame_scratch_buffer, &post_filter,
- &pending_tiles);
+ status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+ &post_filter, &pending_tiles);
}
if (status != kStatusOk) return status;
if (frame_header.enable_frame_end_update_cdf) {
@@ -928,8 +1003,8 @@ StatusCode DecoderImpl::DecodeTilesNonFrameParallel(
}
}
post_filter->ApplyFilteringForOneSuperBlockRow(
- row4x4, block_width4x4,
- row4x4 + block_width4x4 >= frame_header.rows4x4);
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
}
frame_scratch_buffer->tile_scratch_buffer_pool.Release(
std::move(tile_scratch_buffer));
@@ -937,11 +1012,7 @@ StatusCode DecoderImpl::DecodeTilesNonFrameParallel(
}
StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel(
- const ObuSequenceHeader& sequence_header,
- const ObuFrameHeader& frame_header,
const Vector<std::unique_ptr<Tile>>& tiles,
- const Vector<ObuTileGroup>& tile_groups,
- const BlockParametersHolder& block_parameters_holder,
FrameScratchBuffer* const frame_scratch_buffer,
PostFilter* const post_filter,
BlockingCounterWithStatus* const pending_tiles) {
@@ -964,7 +1035,7 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel(
tile_count) {
if (!failed) {
const auto& tile_ptr = tiles[index];
- if (!tile_ptr->ParseAndDecode(/*is_main_thread=*/false)) {
+ if (!tile_ptr->ParseAndDecode()) {
LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
failed = true;
}
@@ -981,7 +1052,7 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel(
tile_count) {
if (!tile_decoding_failed) {
const auto& tile_ptr = tiles[index];
- if (!tile_ptr->ParseAndDecode(/*is_main_thread=*/true)) {
+ if (!tile_ptr->ParseAndDecode()) {
LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
tile_decoding_failed = true;
}
@@ -995,15 +1066,8 @@ StatusCode DecoderImpl::DecodeTilesThreadedNonFrameParallel(
// Wait until all the tiles have been decoded.
tile_decoding_failed |= !pending_tiles->Wait();
if (tile_decoding_failed) return kStatusUnknownError;
- if (post_filter->DoDeblock() && kDeblockFilterBitMask) {
- frame_scratch_buffer->loop_filter_mask.Build(
- sequence_header, frame_header, tile_groups.front().start,
- tile_groups.back().end, block_parameters_holder,
- frame_scratch_buffer->inter_transform_sizes);
- }
- if (threading_strategy.post_filter_thread_pool() != nullptr) {
- post_filter->ApplyFilteringThreaded();
- }
+ assert(threading_strategy.post_filter_thread_pool() != nullptr);
+ post_filter->ApplyFilteringThreaded();
return kStatusOk;
}
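
The loops above (and the frame-parallel variant added below) hand out tile indices through a single atomic counter, with the calling thread pulling work alongside the pool workers. A reduced, self-contained sketch of that distribution scheme, using std::thread as a stand-in for the library's thread pool:

    #include <atomic>
    #include <functional>
    #include <thread>
    #include <vector>

    // Decode |tile_count| tiles using |num_workers| extra threads plus the
    // calling thread; each thread fetches the next index from a shared counter,
    // so tiles are load-balanced without a per-tile work queue.
    bool DecodeAllTiles(int tile_count, int num_workers,
                        const std::function<bool(int)>& decode_tile) {
      std::atomic<int> tile_counter(0);
      std::atomic<bool> failed(false);
      auto worker = [&]() {
        int index;
        while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
               tile_count) {
          // Keep draining the counter after a failure so every thread exits.
          if (!failed.load(std::memory_order_relaxed) && !decode_tile(index)) {
            failed = true;
          }
        }
      };
      std::vector<std::thread> threads;
      for (int i = 0; i < num_workers; ++i) threads.emplace_back(worker);
      worker();  // The calling thread also participates.
      for (auto& t : threads) t.join();
      return !failed;
    }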
@@ -1048,8 +1112,8 @@ StatusCode DecoderImpl::DecodeTilesFrameParallel(
}
}
const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
- row4x4, block_width4x4,
- row4x4 + block_width4x4 >= frame_header.rows4x4);
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
if (progress_row >= 0) {
current_frame->SetProgress(progress_row);
}
@@ -1062,6 +1126,309 @@ StatusCode DecoderImpl::DecodeTilesFrameParallel(
return kStatusOk;
}
+StatusCode DecoderImpl::DecodeTilesThreadedFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ const int num_workers = thread_pool.num_threads();
+ BlockingCounterWithStatus parse_workers(num_workers);
+ // Submit tile parsing jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+ parse_workers.Decrement(!failed);
+ });
+ }
+
+ // Have the current thread participate in parsing.
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+
+ // Wait until all the parse workers are done. This ensures that all the tiles
+ // have been parsed.
+ if (!parse_workers.Wait() || failed) {
+ return kLibgav1StatusUnknownError;
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetCurrentFrameSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ current_frame->SetFrameState(kFrameStateParsed);
+
+ // Decode the frame.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header.use_128x128_superblock ? 5 : 4;
+ const int superblock_rows =
+ (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+ if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
+ !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+ superblock_rows)) {
+ return kLibgav1StatusOutOfMemory;
+ }
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ memset(superblock_row_progress, 0,
+ superblock_rows * sizeof(superblock_row_progress[0]));
+ frame_scratch_buffer->tile_decoding_failed = false;
+ const int tile_columns = frame_header.tile_info.tile_columns;
+ const bool decode_entire_tiles_in_worker_threads =
+ num_workers >= tile_columns;
+ BlockingCounter pending_jobs(
+ decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+ if (decode_entire_tiles_in_worker_threads) {
+ // Submit tile decoding jobs to the thread pool.
+ tile_counter = 0;
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+ frame_scratch_buffer, superblock_rows]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (failed) continue;
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Decode(
+ &frame_scratch_buffer->superblock_row_mutex,
+ frame_scratch_buffer->superblock_row_progress.get(),
+ frame_scratch_buffer->superblock_row_progress_condvar
+ .get())) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ }
+ }
+ pending_jobs.Decrement();
+ });
+ }
+ } else {
+ // Schedule the jobs for first tile row.
+ for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+ thread_pool.Schedule([this, &tiles, tile_index, block_width4x4,
+ tile_columns, superblock_rows, frame_scratch_buffer,
+ post_filter, &pending_jobs]() {
+ DecodeSuperBlockRowInTile(
+ tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, &pending_jobs);
+ pending_jobs.Decrement();
+ });
+ }
+ }
+
+ // Current thread will do the post filters.
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+ for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4, ++index) {
+ if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+ tile_row_base += tile_columns;
+ }
+ {
+ std::unique_lock<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ while (superblock_row_progress[index] != tile_columns &&
+ !frame_scratch_buffer->tile_decoding_failed) {
+ superblock_row_progress_condvar[index].wait(lock);
+ }
+ if (frame_scratch_buffer->tile_decoding_failed) break;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply the deblocking filter for the tile boundaries of this superblock
+ // row. The deblocking filter for the internal blocks is applied in the
+ // tile worker threads, so this thread only handles the tile boundaries.
+ ApplyDeblockingFilterForTileBoundaries(
+ post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+ tile_columns, decode_entire_tiles_in_worker_threads);
+ }
+ // Apply all the post filters other than deblocking.
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/false);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Wait until all the pending jobs are done. This ensures that all the tiles
+ // have been decoded and wrapped up.
+ pending_jobs.Wait();
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ if (frame_scratch_buffer->tile_decoding_failed) {
+ return kLibgav1StatusUnknownError;
+ }
+ }
+
+ current_frame->SetFrameState(kFrameStateDecoded);
+ return kStatusOk;
+}
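The parse phase above hands out tile indices through a shared atomic counter: every worker, plus the calling thread, keeps claiming the next index until the counter passes the tile count, and the combined pass/fail result is collected afterwards. A minimal standalone sketch of that distribution pattern, using std::thread directly and a hypothetical ParseTile() in place of Tile::Parse():

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Hypothetical stand-in for Tile::Parse(); returns false on failure.
bool ParseTile(int index) {
  std::printf("parsed tile %d\n", index);
  return true;
}

int main() {
  constexpr int kTileCount = 8;
  constexpr int kNumWorkers = 3;
  std::atomic<int> tile_counter(0);
  std::atomic<bool> any_failed(false);

  // Each worker claims the next unparsed tile index from the shared counter.
  // After a failure the worker keeps draining indices without parsing them,
  // mirroring the if (!failed) check in the lambda above; the final status is
  // aggregated in |any_failed| (standing in for BlockingCounterWithStatus).
  auto worker = [&]() {
    bool failed = false;
    int index;
    while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
           kTileCount) {
      if (!failed && !ParseTile(index)) failed = true;
    }
    if (failed) any_failed.store(true, std::memory_order_relaxed);
  };

  std::vector<std::thread> threads;
  for (int i = 0; i < kNumWorkers; ++i) threads.emplace_back(worker);
  worker();  // The calling thread participates in parsing as well.
  for (auto& t : threads) t.join();
  return any_failed ? 1 : 0;
}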
+
+void DecoderImpl::DecodeSuperBlockRowInTile(
+ const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+ const int superblock_size4x4, const int tile_columns,
+ const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (scratch_buffer == nullptr) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ Tile& tile = *tiles[tile_index];
+ const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get());
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(scratch_buffer));
+ if (!ok) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile except
+ // for the first 64 columns.
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
+ superblock_size4x4);
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile, column4x4_end
+ // may not be a multiple of 16. In that case it is still okay to simply
+ // subtract 16 since ApplyDeblockFilter() will only do the filters in
+ // increments of 64 columns (or 32 columns for chroma with subsampling).
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+ tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+ }
+ const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+ const int index = row4x4 >> superblock_size4x4_log2;
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ bool notify;
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ notify = ++superblock_row_progress[index] == tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ // Schedule the next superblock row (if one exists).
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ const int next_row4x4 = row4x4 + superblock_size4x4;
+ if (!tile.IsRow4x4Inside(next_row4x4)) {
+ tile_index += tile_columns;
+ }
+ if (tile_index >= tiles.size()) return;
+ pending_jobs->IncrementBy(1);
+ thread_pool.Schedule([this, &tiles, tile_index, next_row4x4,
+ superblock_size4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, pending_jobs]() {
+ DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+ superblock_size4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, pending_jobs);
+ pending_jobs->Decrement();
+ });
+}
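Within the decode phase, each tile column bumps a per-superblock-row counter under superblock_row_mutex and signals the corresponding condition variable once every column has finished that row, which is what the post-filter loop above waits on. A reduced sketch of that handoff (plain C++, with the actual decoding and filtering replaced by placeholders):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  constexpr int kTileColumns = 2;
  constexpr int kRows = 4;
  std::mutex mutex;
  std::vector<int> progress(kRows, 0);
  std::vector<std::condition_variable> row_done(kRows);

  // Decoding threads: one per tile column, finishing superblock rows in order.
  auto decode_column = [&](int column) {
    for (int row = 0; row < kRows; ++row) {
      // ... decode the superblock row of this column here ...
      bool notify;
      {
        std::lock_guard<std::mutex> lock(mutex);
        notify = ++progress[row] == kTileColumns;
      }
      // Only the last column to finish the row wakes the post-filter thread.
      if (notify) row_done[row].notify_one();
    }
  };

  std::vector<std::thread> workers;
  for (int c = 0; c < kTileColumns; ++c) workers.emplace_back(decode_column, c);

  // Post-filter thread: wait until an entire frame row is decoded, then filter.
  for (int row = 0; row < kRows; ++row) {
    std::unique_lock<std::mutex> lock(mutex);
    while (progress[row] != kTileColumns) row_done[row].wait(lock);
    std::printf("post-filtering row %d\n", row);
  }
  for (auto& t : workers) t.join();
  return 0;
}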
+
+void DecoderImpl::ApplyDeblockingFilterForTileBoundaries(
+ PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
+ const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+ int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+ // Apply vertical deblock filtering for the first 64 columns of each tile.
+ for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ if (decode_entire_tiles_in_worker_threads &&
+ row4x4 == tile_row_base[0]->row4x4_start()) {
+ // This is the first superblock row of a tile row. In this case, apply
+ // horizontal deblock filtering for the entire superblock row.
+ post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+ frame_header.columns4x4, block_width4x4);
+ } else {
+ // Apply horizontal deblock filtering for the first 64 columns of the
+ // first tile.
+ const Tile& first_tile = *tile_row_base[0];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+ first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // previous tile and the first 64 columns of the current tile.
+ for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ // If the previous tile has more than 64 columns, then include those
+ // for the horizontal deblock.
+ const Tile& previous_tile = *tile_row_base[tile_column - 1];
+ const int column4x4_start =
+ tile.column4x4_start() -
+ ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+ previous_tile.column4x4_start())
+ ? kNum4x4InLoopFilterUnit
+ : 0);
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // last tile.
+ const Tile& last_tile = *tile_row_base[tile_columns - 1];
+ // Identify the last column4x4 value and do horizontal filtering for
+ // that column4x4. The value of the last column4x4 is the nearest
+ // multiple of 16 that is before tile.column4x4_end().
+ const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+ // If column4x4_start is the same as tile.column4x4_start() then it
+ // means that the last tile has <= 64 columns. So there is nothing left
+ // to deblock (since it was already deblocked in the loop above).
+ if (column4x4_start != last_tile.column4x4_start()) {
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ last_tile.column4x4_end(), block_width4x4);
+ }
+ }
+}
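The start of the last tile's final horizontal deblock pass is the nearest multiple of 16 at or below column4x4_end() - 1, computed with a mask instead of a division. A small worked example of that arithmetic in isolation (not libgav1 code):

#include <cassert>
#include <cstdio>

// Nearest multiple of 16 that is strictly before |column4x4_end|, matching
// the (end - 1) & ~15 expression used above.
int LastFilterUnitStart(int column4x4_end) { return (column4x4_end - 1) & ~15; }

int main() {
  // A 64-pixel loop filter unit spans 16 4x4 columns.
  assert(LastFilterUnitStart(96) == 80);   // Tile ends exactly on a unit.
  assert(LastFilterUnitStart(100) == 96);  // Tile ends 4 columns past a unit.
  std::printf("ok\n");
  return 0;
}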
+
void DecoderImpl::SetCurrentFrameSegmentationMap(
const ObuFrameHeader& frame_header, const SegmentationMap* prev_segment_ids,
RefCountedBuffer* const current_frame) {
@@ -1092,10 +1459,7 @@ StatusCode DecoderImpl::ApplyFilmGrain(
return kStatusOk;
}
if (!frame_header.show_existing_frame &&
- frame_header.refresh_frame_flags == 0 &&
- // TODO(vigneshv): In frame parallel mode, we never do film grain in
- // place. Revisit this and see if this constraint need to be enforced.
- !IsFrameParallel()) {
+ frame_header.refresh_frame_flags == 0) {
// If show_existing_frame is true, then the current frame is a previously
// saved reference frame. If refresh_frame_flags is nonzero, then the
// state_.UpdateReferenceFrames() call above has saved the current frame as
diff --git a/chromium/third_party/libgav1/src/src/decoder_impl.h b/chromium/third_party/libgav1/src/src/decoder_impl.h
index dbc79ed85d7..4d58999c95e 100644
--- a/chromium/third_party/libgav1/src/src/decoder_impl.h
+++ b/chromium/third_party/libgav1/src/src/decoder_impl.h
@@ -18,7 +18,6 @@
#define LIBGAV1_SRC_DECODER_IMPL_H_
#include <array>
-#include <atomic>
#include <condition_variable> // NOLINT (unapproved c++11 header)
#include <cstddef>
#include <cstdint>
@@ -32,7 +31,6 @@
#include "src/gav1/decoder_buffer.h"
#include "src/gav1/decoder_settings.h"
#include "src/gav1/status_code.h"
-#include "src/loop_filter_mask.h"
#include "src/obu_parser.h"
#include "src/residual_buffer_pool.h"
#include "src/symbol_decoder_context.h"
@@ -129,6 +127,19 @@ class DecoderImpl : public Allocable {
private:
explicit DecoderImpl(const DecoderSettings* settings);
StatusCode Init();
+ // Called when the first frame is enqueued. It does the OBU parsing for one
+ // temporal unit to retrieve the tile configuration and sets up the frame
+ // threading if frame parallel mode is allowed. It also initializes the
+ // |temporal_units_| queue based on the number of frame threads.
+ //
+ // The following are the limitations of the current implementation:
+ // * It assumes that all frames in the video have the same tile
+ // configuration. The frame parallel threading model will not be updated
+ // based on tile configuration changes mid-stream.
+ // * The above assumption holds true even when there is a new coded video
+ // sequence (i.e., a new sequence header).
+ StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+ size_t size);
// Used only in frame parallel mode. Signals failure and waits until the
// worker threads are aborted if |status| is a failure status. If |status| is
// equal to kStatusOk or kStatusTryAgain, this function does not do anything.
@@ -175,11 +186,7 @@ class DecoderImpl : public Allocable {
const Vector<std::unique_ptr<Tile>>& tiles,
FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter);
StatusCode DecodeTilesThreadedNonFrameParallel(
- const ObuSequenceHeader& sequence_header,
- const ObuFrameHeader& frame_header,
const Vector<std::unique_ptr<Tile>>& tiles,
- const Vector<ObuTileGroup>& tile_groups,
- const BlockParametersHolder& block_parameters_holder,
FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter,
BlockingCounterWithStatus* pending_tiles);
StatusCode DecodeTilesFrameParallel(
@@ -190,6 +197,36 @@ class DecoderImpl : public Allocable {
const SegmentationMap* prev_segment_ids,
FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter,
RefCountedBuffer* current_frame);
+ StatusCode DecodeTilesThreadedFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids,
+ FrameScratchBuffer* frame_scratch_buffer, PostFilter* post_filter,
+ RefCountedBuffer* current_frame);
+ // Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+ // superblock row starting at |row4x4| for tile at index |tile_index| in the
+ // list of tiles |tiles|. If the decoding is successful, then it does the
+ // following:
+ // * Schedule the next superblock row in the current tile column for
+ // decoding (the next superblock row may be in a different tile than the
+ // current one).
+ // * If an entire superblock row of the frame has been decoded, it notifies
+ // the waiters (if there are any).
+ void DecodeSuperBlockRowInTile(const Vector<std::unique_ptr<Tile>>& tiles,
+ size_t tile_index, int row4x4,
+ int superblock_size4x4, int tile_columns,
+ int superblock_rows,
+ FrameScratchBuffer* frame_scratch_buffer,
+ PostFilter* post_filter,
+ BlockingCounter* pending_jobs);
+ // Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+ // deblocking filter for tile boundaries for the superblock row at |row4x4|.
+ void ApplyDeblockingFilterForTileBoundaries(
+ PostFilter* post_filter, const std::unique_ptr<Tile>* tile_row_base,
+ const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+ int tile_columns, bool decode_entire_tiles_in_worker_threads);
// Sets the current frame's segmentation map for two cases. The third case
// is handled in Tile::DecodeBlock().
void SetCurrentFrameSegmentationMap(const ObuFrameHeader& frame_header,
@@ -206,6 +243,11 @@ class DecoderImpl : public Allocable {
bool IsNewSequenceHeader(const ObuParser& obu);
bool IsFrameParallel() const { return frame_thread_pool_ != nullptr; }
+ bool HasFailure() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return failure_status_ != kStatusOk;
+ }
+
Queue<TemporalUnit> temporal_units_;
DecoderState state_;
@@ -228,21 +270,16 @@ class DecoderImpl : public Allocable {
// 2) DecodeTiles()
// Both of these functions have to respond to the other one failing by
// aborting whatever they are doing. This variable is used to accomplish that.
- std::atomic<bool> abort_{false};
- // Stores the failure status if |abort_| is true.
- std::atomic<StatusCode> failure_status_{kStatusOk};
+ // If |failure_status_| is not kStatusOk, then the two functions will try to
+ // abort as early as they can.
+ StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_);
ObuSequenceHeader sequence_header_ = {};
// If true, sequence_header is valid.
bool has_sequence_header_ = false;
-#if defined(ENABLE_FRAME_PARALLEL)
- // TODO(b/142583029): A copy of the DecoderSettings is made to facilitate the
- // development of frame parallel mode behind a compile time flag.
- DecoderSettings settings_;
-#else
const DecoderSettings& settings_;
-#endif
+ bool seen_first_frame_ = false;
};
} // namespace libgav1
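With the atomics gone, the failure status is only ever read or written while holding mutex_; HasFailure() above is the read side of that. A minimal sketch of the guarded-status idiom (hypothetical SetFailure()/HasFailure() wrappers, not the DecoderImpl API):

#include <mutex>

enum StatusCode { kStatusOk, kStatusUnknownError };

class FailureTracker {
 public:
  // Record a failure; in this sketch the first recorded status wins.
  void SetFailure(StatusCode status) {
    std::lock_guard<std::mutex> lock(mutex_);
    if (failure_status_ == kStatusOk) failure_status_ = status;
  }
  bool HasFailure() {
    std::lock_guard<std::mutex> lock(mutex_);
    return failure_status_ != kStatusOk;
  }

 private:
  std::mutex mutex_;
  StatusCode failure_status_ = kStatusOk;
};

int main() {
  FailureTracker tracker;
  if (!tracker.HasFailure()) tracker.SetFailure(kStatusUnknownError);
  return tracker.HasFailure() ? 0 : 1;
}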
diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc
index 1fccfb47b36..c005f081279 100644
--- a/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/arm/cdef_neon.cc
@@ -36,16 +36,7 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// CdefDirection:
-// Mirror values and pad to 16 elements.
-alignas(16) constexpr uint32_t kDivisionTable[] = {840, 420, 280, 210, 168, 140,
- 120, 105, 120, 140, 168, 210,
- 280, 420, 840, 0};
-
-// Used when calculating odd |cost[x]| values to mask off unwanted elements.
-// Holds elements 1 3 5 X 5 3 1 X
-alignas(16) constexpr uint32_t kDivisionTableOdd[] = {420, 210, 140, 0,
- 140, 210, 420, 0};
+#include "src/dsp/cdef.inc"
// Expand |a| to int8x16_t, left shift it by |shift| and sum the low
// and high values with |b| and |c| respectively.
@@ -159,10 +150,10 @@ uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
// |cost[0]| and |cost[4]| square the input and sum with the corresponding
// element from the other end of the vector:
-// |kDivisionTable[]| element:
+// |kCdefDivisionTable[]| element:
// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
-// kDivisionTable[i + 1];
-// cost[0] += Square(partial[0][7]) * kDivisionTable[8];
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
// Because everything is being summed into a single value the distributive
// property allows us to mirror the division table and accumulate once.
uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
@@ -179,7 +170,7 @@ uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
uint32_t SquareAccumulate(const uint16x8_t a) {
uint32x4_t c = Square(vget_low_u16(a));
c = SquareAccumulate(c, vget_high_u16(a));
- c = vmulq_n_u32(c, kDivisionTable[7]);
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
return SumVector(c);
}
@@ -188,7 +179,7 @@ uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
// Remove elements 0-2.
uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
c = vaddq_u32(c, Square(vget_high_u16(a)));
- c = vmulq_n_u32(c, kDivisionTable[7]);
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
@@ -230,14 +221,14 @@ void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
cost[6] = SquareAccumulate(partial_lo[6]);
const uint32x4_t division_table[4] = {
- vld1q_u32(kDivisionTable), vld1q_u32(kDivisionTable + 4),
- vld1q_u32(kDivisionTable + 8), vld1q_u32(kDivisionTable + 12)};
+ vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+ vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
- const uint32x4_t division_table_odd[2] = {vld1q_u32(kDivisionTableOdd),
- vld1q_u32(kDivisionTableOdd + 4)};
+ const uint32x4_t division_table_odd[2] = {
+ vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
@@ -328,31 +319,34 @@ int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
return vsubq_s16(veorq_s16(clamp_abs_diff, sign), sign);
}
-template <int width>
+template <int width, bool enable_primary = true, bool enable_secondary = true>
void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
const int direction, const int primary_strength,
const int secondary_strength, const int damping, uint8_t* dst,
const ptrdiff_t dst_stride) {
static_assert(width == 8 || width == 4, "");
+ static_assert(enable_primary || enable_secondary, "");
const uint16x8_t cdef_large_value_mask =
vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
const int16x8_t primary_threshold = vdupq_n_s16(primary_strength);
const int16x8_t secondary_threshold = vdupq_n_s16(secondary_strength);
int16x8_t primary_damping_shift, secondary_damping_shift;
+
// FloorLog2() requires input to be > 0.
- if (primary_strength == 0) {
- primary_damping_shift = vdupq_n_s16(0);
- } else {
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
primary_damping_shift =
vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
}
-
- if (secondary_strength == 0) {
- secondary_damping_shift = vdupq_n_s16(0);
- } else {
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
secondary_damping_shift =
- vdupq_n_s16(-std::max(0, damping - FloorLog2(secondary_strength)));
+ vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
}
const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0];
@@ -366,105 +360,112 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
} else {
pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
}
+
uint16x8_t min = pixel;
uint16x8_t max = pixel;
-
- // Primary |direction|.
- uint16x8_t primary_val[4];
- if (width == 8) {
- LoadDirection(src, src_stride, primary_val, direction);
+ int16x8_t sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ uint16x8_t primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ min = vminq_u16(min, primary_val[0]);
+ min = vminq_u16(min, primary_val[1]);
+ min = vminq_u16(min, primary_val[2]);
+ min = vminq_u16(min, primary_val[3]);
+
+ // Convert kCdefLargeValue to 0 before calculating max.
+ max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask));
+
+ sum = Constrain(primary_val[0], pixel, primary_threshold,
+ primary_damping_shift);
+ sum = vmulq_n_s16(sum, primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[1], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[2], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[3], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
} else {
- LoadDirection4(src, src_stride, primary_val, direction);
+ sum = vdupq_n_s16(0);
}
- min = vminq_u16(min, primary_val[0]);
- min = vminq_u16(min, primary_val[1]);
- min = vminq_u16(min, primary_val[2]);
- min = vminq_u16(min, primary_val[3]);
-
- // Convert kCdefLargeValue to 0 before calculating max.
- max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask));
-
- int16x8_t sum = Constrain(primary_val[0], pixel, primary_threshold,
- primary_damping_shift);
- sum = vmulq_n_s16(sum, primary_tap_0);
- sum = vmlaq_n_s16(sum,
- Constrain(primary_val[1], pixel, primary_threshold,
- primary_damping_shift),
- primary_tap_0);
- sum = vmlaq_n_s16(sum,
- Constrain(primary_val[2], pixel, primary_threshold,
- primary_damping_shift),
- primary_tap_1);
- sum = vmlaq_n_s16(sum,
- Constrain(primary_val[3], pixel, primary_threshold,
- primary_damping_shift),
- primary_tap_1);
-
- // Secondary |direction| values (+/- 2). Clamp |direction|.
- uint16x8_t secondary_val[8];
- if (width == 8) {
- LoadDirection(src, src_stride, secondary_val, (direction + 2) & 0x7);
- LoadDirection(src, src_stride, secondary_val + 4, (direction - 2) & 0x7);
- } else {
- LoadDirection4(src, src_stride, secondary_val, (direction + 2) & 0x7);
- LoadDirection4(src, src_stride, secondary_val + 4, (direction - 2) & 0x7);
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ uint16x8_t secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ min = vminq_u16(min, secondary_val[0]);
+ min = vminq_u16(min, secondary_val[1]);
+ min = vminq_u16(min, secondary_val[2]);
+ min = vminq_u16(min, secondary_val[3]);
+ min = vminq_u16(min, secondary_val[4]);
+ min = vminq_u16(min, secondary_val[5]);
+ min = vminq_u16(min, secondary_val[6]);
+ min = vminq_u16(min, secondary_val[7]);
+
+ max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask));
+
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[0], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[1], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[2], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[3], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[4], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[5], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[6], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[7], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
}
-
- min = vminq_u16(min, secondary_val[0]);
- min = vminq_u16(min, secondary_val[1]);
- min = vminq_u16(min, secondary_val[2]);
- min = vminq_u16(min, secondary_val[3]);
- min = vminq_u16(min, secondary_val[4]);
- min = vminq_u16(min, secondary_val[5]);
- min = vminq_u16(min, secondary_val[6]);
- min = vminq_u16(min, secondary_val[7]);
-
- max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask));
- max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask));
-
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[0], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap0);
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[1], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap0);
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[2], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap1);
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[3], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap1);
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[4], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap0);
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[5], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap0);
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[6], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap1);
- sum = vmlaq_n_s16(sum,
- Constrain(secondary_val[7], pixel, secondary_threshold,
- secondary_damping_shift),
- kCdefSecondaryTap1);
-
// Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
sum = vaddq_s16(sum, vdupq_n_s16(8));
@@ -495,26 +496,48 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
// inside the frame. However it requires the source input to be padded with a
// constant large value if at the boundary. The input must be uint16_t.
void CdefFilter_NEON(const void* const source, const ptrdiff_t source_stride,
- const int rows4x4, const int columns4x4, const int curr_x,
- const int curr_y, const int subsampling_x,
- const int subsampling_y, const int primary_strength,
- const int secondary_strength, const int damping,
- const int direction, void* const dest,
+ const int block_width, const int block_height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction, void* const dest,
const ptrdiff_t dest_stride) {
- const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x;
- const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y;
- const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x);
- const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y);
const auto* src = static_cast<const uint16_t*>(source);
auto* dst = static_cast<uint8_t*>(dest);
- if (block_width == 8) {
- DoCdef<8>(src, source_stride, block_height, direction, primary_strength,
- secondary_strength, damping, dst, dest_stride);
+ // TODO(slavarnway): Change dsp->cdef_filter to dsp->cdef_filter[2][2]. This
+ // would eliminate the strength checks.
+ if (secondary_strength > 0) {
+ if (primary_strength > 0) {
+ if (block_width == 8) {
+ DoCdef<8>(src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ } else {
+ assert(block_width == 4);
+ DoCdef<4>(src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ }
+ } else {
+ if (block_width == 8) {
+ DoCdef<8, /*enable_primary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ } else {
+ assert(block_width == 4);
+ DoCdef<4, /*enable_primary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ }
+ }
} else {
- assert(block_width == 4);
- DoCdef<4>(src, source_stride, block_height, direction, primary_strength,
- secondary_strength, damping, dst, dest_stride);
+ if (block_width == 8) {
+ DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ } else {
+ assert(block_width == 4);
+ DoCdef<4, /*enable_primary=*/true, /*enable_secondary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ }
}
}
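The new damping-shift logic relies on the value ranges noted in the comments: primary_strength can reach 15, so damping - FloorLog2(primary_strength) may go negative for the UV damping range and needs the std::max clamp, while secondary_strength tops out at 4 and never does. A scalar check of that arithmetic (plain C++, not the NEON path):

#include <algorithm>
#include <cassert>

// Local reimplementation of FloorLog2() (the real one lives in libgav1's
// utilities); input must be > 0.
int FloorLog2(int n) {
  int result = 0;
  while (n > 1) {
    n >>= 1;
    ++result;
  }
  return result;
}

int main() {
  // primary_strength: [1, 15] -> FloorLog2: [0, 3]. With the 8-bit UV damping
  // range of [2, 5], damping - FloorLog2(strength) can go negative, hence the
  // std::max(0, ...) clamp in DoCdef().
  const int uv_damping = 2;
  const int primary_strength = 15;
  const int primary_shift =
      std::max(0, uv_damping - FloorLog2(primary_strength));
  assert(primary_shift == 0);  // Clamped; 2 - 3 would be -1.

  // secondary_strength: [1, 4] -> FloorLog2: [0, 2], so even the minimum
  // damping of 2 never produces a negative shift and no clamp is needed.
  const int secondary_strength = 4;
  assert(uv_damping - FloorLog2(secondary_strength) >= 0);
  return primary_shift;
}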
diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc
index 34868826dcd..424be020bff 100644
--- a/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/arm/convolve_neon.cc
@@ -1350,8 +1350,6 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
const int height, void* dest,
const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
- constexpr int kernel_offset = (8 - num_taps) / 2;
- src += src_stride * kernel_offset;
const int16_t* src_y = src;
// |dest| is 16-bit in compound mode, Pixel otherwise.
uint16_t* dest16_y = static_cast<uint16_t*>(dest);
@@ -1425,8 +1423,6 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
const int step_y, const int height,
void* dest, const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
- constexpr int kernel_offset = (8 - num_taps) / 2;
- src += src_stride * kernel_offset;
// A possible improvement is to use arithmetic to decide how many times to
// apply filters to same source before checking whether to load new srcs.
// However, this will only improve performance with very small step sizes.
@@ -1498,15 +1494,14 @@ void ConvolveScale2D_NEON(const void* const reference,
const int subpixel_y, const int step_x,
const int step_y, const int width, const int height,
void* prediction, const ptrdiff_t pred_stride) {
- // TODO(petersonab): Reduce the height here by using the vertical filter
- // size and offset horizontal filter. Reduce intermediate block stride to
- // width to make smaller blocks faster.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
kScaleSubPixelBits) +
- kSubPixelTaps;
- // TODO(b/133525024): Decide whether it's worth branching to a special case
- // when step_x or step_y is 1024.
+ num_vert_taps;
assert(step_x <= 2048);
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
@@ -1520,11 +1515,27 @@ void ConvolveScale2D_NEON(const void* const reference,
// Similarly for height.
int filter_index = GetFilterIndex(horizontal_filter_index, width);
int16_t* intermediate = intermediate_result;
- const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+ // Derive the maximum value of |step_x| at which all source values fit in one
+ // 16-byte load. The final index is src_x + |num_taps| - 1 < 16.
+ // step_x * 7 is the final base subpel index for the shuffle mask for filter
+ // inputs in each iteration on large blocks. When step_x is large, we need a
+ // larger structure and use a larger table lookup in order to gather all
+ // filter inputs.
+ // |num_taps| - 1 is the shuffle index of the final filter input.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
switch (filter_index) {
case 0:
- if (step_x > 1024) {
+ if (step_x > grade_x_threshold) {
ConvolveKernelHorizontalSigned6Tap<2>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
@@ -1535,7 +1546,7 @@ void ConvolveScale2D_NEON(const void* const reference,
}
break;
case 1:
- if (step_x > 1024) {
+ if (step_x > grade_x_threshold) {
ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
step_x, intermediate_height,
intermediate);
@@ -1547,7 +1558,7 @@ void ConvolveScale2D_NEON(const void* const reference,
}
break;
case 2:
- if (step_x > 1024) {
+ if (step_x > grade_x_threshold) {
ConvolveKernelHorizontalSigned8Tap<2>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
@@ -1558,7 +1569,7 @@ void ConvolveScale2D_NEON(const void* const reference,
}
break;
case 3:
- if (step_x > 1024) {
+ if (step_x > grade_x_threshold) {
ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
step_x, intermediate_height,
intermediate);
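grade_x_threshold replaces the hard-coded step_x > 1024 checks: it is the largest step_x for which the base subpel index of the eighth lane, (step_x * 7) >> kScaleSubPixelBits, still leaves room for all filter taps inside one 16-byte load. A worked example of the derivation for the 6-tap case, assuming kScaleSubPixelBits is 10 (consistent with step_x == 1024 being a unit step):

#include <cstdio>

int main() {
  constexpr int kScaleSubPixelBits = 10;
  // For a 6-tap horizontal filter the last tap index is num_taps - 1 = 5,
  // so the kernel may start no later than byte 16 - 6 = 10 within the load.
  const int num_horiz_taps = 6;
  const int kernel_start_ceiling = 16 - num_horiz_taps;
  // Pick step_x such that (step_x * 7) >> kScaleSubPixelBits stays below the
  // ceiling; 7 is the last of the 8 lanes processed per iteration.
  const int grade_x_threshold =
      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
  std::printf("6-tap threshold: %d\n", grade_x_threshold);            // 1462
  std::printf("at threshold: %d\n",
              (grade_x_threshold * 7) >> kScaleSubPixelBits);         // 9 < 10
  std::printf("just above: %d\n",
              ((grade_x_threshold + 1) * 7) >> kScaleSubPixelBits);   // 10
  return 0;
}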
diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc
index f63fabdd7e2..e89ba36773b 100644
--- a/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/arm/loop_restoration_neon.cc
@@ -16,7 +16,6 @@
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
-
#include <arm_neon.h>
#include <cassert>
@@ -33,6 +32,11 @@ namespace dsp {
namespace low_bitdepth {
namespace {
+template <int bytes>
+inline uint16x4_t VshrU128(const uint16x8_t a) {
+ return vext_u16(vget_low_u16(a), vget_high_u16(a), bytes / 2);
+}
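The new VshrU128<bytes> helper uses vext_u16 to pull out the four 16-bit lanes starting |bytes| into the 128-bit register, i.e. a byte-granularity right shift across the whole vector. A scalar model of that lane selection (plain C++ standing in for the intrinsic):

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of VshrU128<bytes>: take the four uint16 lanes that start at
// byte offset |bytes| within an 8-lane (128-bit) vector.
template <int bytes>
std::array<uint16_t, 4> VshrU128Model(const std::array<uint16_t, 8>& a) {
  static_assert(bytes % 2 == 0 && bytes < 8, "lane-aligned shift only");
  std::array<uint16_t, 4> result;
  for (int i = 0; i < 4; ++i) result[i] = a[bytes / 2 + i];
  return result;
}

int main() {
  const std::array<uint16_t, 8> a = {0, 1, 2, 3, 4, 5, 6, 7};
  assert((VshrU128Model<2>(a) == std::array<uint16_t, 4>{1, 2, 3, 4}));
  assert((VshrU128Model<6>(a) == std::array<uint16_t, 4>{3, 4, 5, 6}));
  return 0;
}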
+
// Wiener
// Must make a local copy of coefficients to help compiler know that they have
@@ -50,7 +54,6 @@ inline void PopulateWienerCoefficients(
assert(direction == WienerInfo::kVertical);
filter_3 = 128;
}
-
for (int i = 0; i < 3; ++i) {
const int16_t coeff = restoration_info.wiener_info.filter[direction][i];
filter[i] = coeff;
@@ -76,74 +79,24 @@ inline int CountZeroCoefficients(const int16_t filter[2][kSubPixelTaps]) {
return number_zero_coefficients;
}
-inline void LoadHorizontal4Tap3(const uint8_t* source, uint8x8_t s[3]) {
- s[0] = vld1_u8(source);
- // Faster than using vshr_n_u64().
- s[1] = vext_u8(s[0], s[0], 1);
- s[2] = vext_u8(s[0], s[0], 2);
-}
-
-inline void LoadHorizontal4Tap5(const uint8_t* source, uint8x8_t s[5]) {
- s[0] = vld1_u8(source);
- // Faster than using vshr_n_u64().
- s[1] = vext_u8(s[0], s[0], 1);
- s[2] = vext_u8(s[0], s[0], 2);
- s[3] = vext_u8(s[0], s[0], 3);
- s[4] = vext_u8(s[0], s[0], 4);
-}
-
-inline void LoadHorizontal8Tap3(const uint8_t* source, uint8x8_t s[3]) {
- const uint8x16_t r = vld1q_u8(source);
- s[0] = vget_low_u8(r);
- s[1] = vext_u8(s[0], vget_high_u8(r), 1);
- s[2] = vext_u8(s[0], vget_high_u8(r), 2);
-}
-
-inline void LoadHorizontal8Tap5(const uint8_t* source, uint8x8_t s[5]) {
- const uint8x16_t r = vld1q_u8(source);
- s[0] = vget_low_u8(r);
- s[1] = vext_u8(s[0], vget_high_u8(r), 1);
- s[2] = vext_u8(s[0], vget_high_u8(r), 2);
- s[3] = vext_u8(s[0], vget_high_u8(r), 3);
- s[4] = vext_u8(s[0], vget_high_u8(r), 4);
-}
-
-inline void LoadHorizontalTap7(const uint8_t* source, uint8x8_t s[7]) {
- // This is just as fast as an 8x8 transpose but avoids over-reading
- // extra rows. It always over-reads by at least 1 value. On small widths
- // (4xH) it over-reads by 9 values.
- const uint8x16_t r = vld1q_u8(source);
- s[0] = vget_low_u8(r);
- s[1] = vext_u8(s[0], vget_high_u8(r), 1);
- s[2] = vext_u8(s[0], vget_high_u8(r), 2);
- s[3] = vext_u8(s[0], vget_high_u8(r), 3);
- s[4] = vext_u8(s[0], vget_high_u8(r), 4);
- s[5] = vext_u8(s[0], vget_high_u8(r), 5);
- s[6] = vext_u8(s[0], vget_high_u8(r), 6);
-}
-
inline int16x8_t HorizontalSum(const uint8x8_t a[3], const int16_t filter[2],
int16x8_t sum) {
const int16x8_t a_0_2 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[2]));
sum = vmlaq_n_s16(sum, a_0_2, filter[0]);
sum = vmlaq_n_s16(sum, vreinterpretq_s16_u16(vmovl_u8(a[1])), filter[1]);
-
sum = vrshrq_n_s16(sum, kInterRoundBitsHorizontal);
-
// Delaying |horizontal_rounding| until after down shifting allows the sum to
// stay in 16 bits.
// |horizontal_rounding| = 1 << (bitdepth + kWienerFilterBits - 1)
// 1 << ( 8 + 7 - 1)
// Plus |kInterRoundBitsHorizontal| and it works out to 1 << 11.
sum = vaddq_s16(sum, vdupq_n_s16(1 << 11));
-
// Just like |horizontal_rounding|, adding |filter[3]| at this point allows
// the sum to stay in 16 bits.
// But wait! We *did* calculate |filter[3]| and used it in the sum! But it was
// offset by 128. Fix that here:
// |src[3]| * 128 >> 3 == |src[3]| << 4
sum = vaddq_s16(sum, vreinterpretq_s16_u16(vshll_n_u8(a[1], 4)));
-
// Saturate to
// [0,
// (1 << (bitdepth + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1)]
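The comments in HorizontalSum describe two bookkeeping tricks: the horizontal rounding constant 1 << (8 + 7 - 1) is added only after the >> kInterRoundBitsHorizontal shift (so it shrinks to 1 << 11 and the running sum fits in 16 bits), and the center tap, which was fed into the multiply offset by 128, is corrected afterwards with src << 4 because 128 >> 3 == 16. A scalar sketch showing the reordered arithmetic matches the straightforward form (hypothetical pixel and coefficient values):

#include <cassert>

int main() {
  // 8-bit Wiener horizontal constants: kWienerFilterBits = 7 and
  // kInterRoundBitsHorizontal = 3, as in the comments above.
  constexpr int kInterRoundBitsHorizontal = 3;
  constexpr int kHorizontalRounding = 1 << (8 + 7 - 1);  // 1 << 14

  // Hypothetical 7-tap input and a symmetric filter; filter[3] is the true
  // center tap and filter3_stored is the 128-offset value the multiply sees.
  const int src[7] = {200, 210, 220, 8, 220, 210, 200};
  const int filter[4] = {1, -5, 17, 102};
  const int filter3_stored = filter[3] - 128;

  // Straightforward order: add the rounding constant, then shift.
  int reference = kHorizontalRounding;
  for (int i = 0; i < 3; ++i) reference += (src[i] + src[6 - i]) * filter[i];
  reference += src[3] * filter[3];
  reference >>= kInterRoundBitsHorizontal;

  // Reordered as in the NEON code: shift first using the offset center tap,
  // then add the post-shift rounding (1 << 11) and the src[3] << 4 correction.
  // (Plain truncating shifts are used on both sides of this comparison; the
  // NEON path additionally folds in a rounding shift.)
  int sum = 0;
  for (int i = 0; i < 3; ++i) sum += (src[i] + src[6 - i]) * filter[i];
  sum += src[3] * filter3_stored;
  sum >>= kInterRoundBitsHorizontal;
  sum += 1 << 11;
  sum += src[3] << 4;

  assert(sum == reference);
  return sum == reference ? 0 : 1;
}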
@@ -153,111 +106,6 @@ inline int16x8_t HorizontalSum(const uint8x8_t a[3], const int16_t filter[2],
return sum;
}
-inline int16x8_t HorizontalSumTap3(const uint8x8_t a[3],
- const int16_t filter[2]) {
- return HorizontalSum(a, filter, vdupq_n_s16(0));
-}
-
-inline int16x8_t HorizontalSumTap5(const uint8x8_t a[5],
- const int16_t filter[3]) {
- const int16x8_t a_0_4 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[4]));
- const int16x8_t sum = vmulq_n_s16(a_0_4, filter[0]);
- return HorizontalSum(a + 1, filter + 1, sum);
-}
-
-inline int16x8_t HorizontalSumTap7(const uint8x8_t a[7],
- const int16_t filter[4]) {
- const int16x8_t a_0_6 = vreinterpretq_s16_u16(vaddl_u8(a[0], a[6]));
- const int16x8_t a_1_5 = vreinterpretq_s16_u16(vaddl_u8(a[1], a[5]));
- int16x8_t sum = vmulq_n_s16(a_0_6, filter[0]);
- sum = vmlaq_n_s16(sum, a_1_5, filter[1]);
- return HorizontalSum(a + 2, filter + 2, sum);
-}
-
-inline int16x8_t WienerHorizontal4Tap3(const uint8_t* source,
- const int16_t filter[2]) {
- uint8x8_t s[5];
- LoadHorizontal4Tap3(source, s);
- return HorizontalSumTap3(s, filter);
-}
-
-inline int16x8_t WienerHorizontal4Tap5(const uint8_t* source,
- const int16_t filter[3]) {
- uint8x8_t s[5];
- LoadHorizontal4Tap5(source, s);
- return HorizontalSumTap5(s, filter);
-}
-
-inline int16x8_t WienerHorizontal4Tap7(const uint8_t* source,
- const int16_t filter[4]) {
- uint8x8_t s[7];
- LoadHorizontalTap7(source, s);
- return HorizontalSumTap7(s, filter);
-}
-
-inline int16x8_t WienerHorizontal4x2Tap3(const uint8_t* source,
- const ptrdiff_t stride,
- const int16_t filter[2]) {
- uint8x8_t s0[5], s1[5], s[5];
- LoadHorizontal4Tap3(source + 0 * stride, s0);
- LoadHorizontal4Tap3(source + 1 * stride, s1);
- s[0] = InterleaveLow32(s0[0], s1[0]);
- s[1] = InterleaveLow32(s0[1], s1[1]);
- s[2] = InterleaveLow32(s0[2], s1[2]);
- return HorizontalSumTap3(s, filter);
-}
-
-inline int16x8_t WienerHorizontal4x2Tap5(const uint8_t* source,
- const ptrdiff_t stride,
- const int16_t filter[3]) {
- uint8x8_t s0[5], s1[5], s[5];
- LoadHorizontal4Tap5(source + 0 * stride, s0);
- LoadHorizontal4Tap5(source + 1 * stride, s1);
- s[0] = InterleaveLow32(s0[0], s1[0]);
- s[1] = InterleaveLow32(s0[1], s1[1]);
- s[2] = InterleaveLow32(s0[2], s1[2]);
- s[3] = InterleaveLow32(s0[3], s1[3]);
- s[4] = InterleaveLow32(s0[4], s1[4]);
- return HorizontalSumTap5(s, filter);
-}
-
-inline int16x8_t WienerHorizontal4x2Tap7(const uint8_t* source,
- const ptrdiff_t stride,
- const int16_t filter[4]) {
- uint8x8_t s0[7], s1[7], s[7];
- LoadHorizontalTap7(source + 0 * stride, s0);
- LoadHorizontalTap7(source + 1 * stride, s1);
- s[0] = InterleaveLow32(s0[0], s1[0]);
- s[1] = InterleaveLow32(s0[1], s1[1]);
- s[2] = InterleaveLow32(s0[2], s1[2]);
- s[3] = InterleaveLow32(s0[3], s1[3]);
- s[4] = InterleaveLow32(s0[4], s1[4]);
- s[5] = InterleaveLow32(s0[5], s1[5]);
- s[6] = InterleaveLow32(s0[6], s1[6]);
- return HorizontalSumTap7(s, filter);
-}
-
-inline int16x8_t WienerHorizontal8Tap3(const uint8_t* source,
- const int16_t filter[2]) {
- uint8x8_t s[3];
- LoadHorizontal8Tap3(source, s);
- return HorizontalSumTap3(s, filter);
-}
-
-inline int16x8_t WienerHorizontal8Tap5(const uint8_t* source,
- const int16_t filter[3]) {
- uint8x8_t s[5];
- LoadHorizontal8Tap5(source, s);
- return HorizontalSumTap5(s, filter);
-}
-
-inline int16x8_t WienerHorizontal8Tap7(const uint8_t* source,
- const int16_t filter[4]) {
- uint8x8_t s[7];
- LoadHorizontalTap7(source, s);
- return HorizontalSumTap7(s, filter);
-}
-
inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2],
int32x4_t sum[2]) {
// -(1 << (bitdepth + kInterRoundBitsVertical - 1))
@@ -265,7 +113,6 @@ inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2],
constexpr int vertical_rounding = -(1 << 18);
const int32x4_t rounding = vdupq_n_s32(vertical_rounding);
const int16x8_t a_0_2 = vaddq_s16(a[0], a[2]);
-
sum[0] = vaddq_s32(sum[0], rounding);
sum[1] = vaddq_s32(sum[1], rounding);
sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_2), filter[0]);
@@ -274,44 +121,9 @@ inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[2],
sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a[1]), filter[1]);
const uint16x4_t sum_lo_16 = vqrshrun_n_s32(sum[0], 11);
const uint16x4_t sum_hi_16 = vqrshrun_n_s32(sum[1], 11);
-
return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16));
}
-inline uint8x8_t WienerVerticalTap3(const int16x8_t a[3],
- const int16_t filter[2]) {
- int32x4_t sum[2];
- sum[0] = sum[1] = vdupq_n_s32(0);
- return WienerVertical(a, filter, sum);
-}
-
-inline uint8x8_t WienerVerticalTap5(const int16x8_t a[5],
- const int16_t filter[3]) {
- const int16x8_t a_0_4 = vaddq_s16(a[0], a[4]);
- int32x4_t sum[2];
-
- sum[0] = sum[1] = vdupq_n_s32(0);
- sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_4), filter[0]);
- sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_4), filter[0]);
-
- return WienerVertical(a + 1, filter + 1, sum);
-}
-
-inline uint8x8_t WienerVerticalTap7(const int16x8_t a[7],
- const int16_t filter[4]) {
- const int16x8_t a_0_6 = vaddq_s16(a[0], a[6]);
- const int16x8_t a_1_5 = vaddq_s16(a[1], a[5]);
- int32x4_t sum[2];
-
- sum[0] = sum[1] = vdupq_n_s32(0);
- sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_6), filter[0]);
- sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_6), filter[0]);
- sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_1_5), filter[1]);
- sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_1_5), filter[1]);
-
- return WienerVertical(a + 2, filter + 2, sum);
-}
-
// For width 16 and up, store the horizontal results, and then do the vertical
// filter row by row. This is faster than doing it column by column when
// considering cache issues.
@@ -330,360 +142,168 @@ void WienerFilter_NEON(const void* const source, void* const dest,
int16_t* wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer);
int16_t filter_horizontal[kSubPixelTaps / 2];
int16_t filter_vertical[kSubPixelTaps / 2];
- int16x8_t a[7];
-
PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
filter_horizontal);
PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
filter_vertical);
-
if (number_zero_coefficients == 0) {
// 7-tap
- src -= kCenterTap * source_stride + kCenterTap;
-
- if (width > 8) {
- int y = height + kSubPixelTaps - 2;
- do {
- int x = 0;
- do {
- const int16x8_t a = WienerHorizontal8Tap7(src + x, filter_horizontal);
- vst1q_s16(wiener_buffer + x, a);
- x += 8;
- } while (x < width);
- src += source_stride;
- wiener_buffer += width;
- } while (--y != 0);
-
- wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer);
-
- y = height;
+ src -= (kCenterTap - 1) * source_stride + kCenterTap;
+ int y = height + kSubPixelTaps - 4;
+ do {
+ wiener_buffer += width;
+ int x = 0;
do {
- int x = 0;
- do {
- a[0] = vld1q_s16(wiener_buffer + x + 0 * width);
- a[1] = vld1q_s16(wiener_buffer + x + 1 * width);
- a[2] = vld1q_s16(wiener_buffer + x + 2 * width);
- a[3] = vld1q_s16(wiener_buffer + x + 3 * width);
- a[4] = vld1q_s16(wiener_buffer + x + 4 * width);
- a[5] = vld1q_s16(wiener_buffer + x + 5 * width);
- a[6] = vld1q_s16(wiener_buffer + x + 6 * width);
-
- const uint8x8_t r = WienerVerticalTap7(a, filter_vertical);
- vst1_u8(dst + x, r);
- x += 8;
- } while (x < width);
- wiener_buffer += width;
- dst += dest_stride;
- } while (--y != 0);
- } else if (width > 4) {
- a[0] = WienerHorizontal8Tap7(src, filter_horizontal);
- src += source_stride;
- a[1] = WienerHorizontal8Tap7(src, filter_horizontal);
- src += source_stride;
- a[2] = WienerHorizontal8Tap7(src, filter_horizontal);
+ // This is just as fast as an 8x8 transpose but avoids over-reading
+ // extra rows. It always over-reads by at least 1 value. On small widths
+ // (4xH) it over-reads by 9 values.
+ const uint8x16_t r = vld1q_u8(src + x);
+ uint8x8_t s[7];
+ s[0] = vget_low_u8(r);
+ s[1] = vext_u8(s[0], vget_high_u8(r), 1);
+ s[2] = vext_u8(s[0], vget_high_u8(r), 2);
+ s[3] = vext_u8(s[0], vget_high_u8(r), 3);
+ s[4] = vext_u8(s[0], vget_high_u8(r), 4);
+ s[5] = vext_u8(s[0], vget_high_u8(r), 5);
+ s[6] = vext_u8(s[0], vget_high_u8(r), 6);
+ const int16x8_t s_0_6 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[6]));
+ const int16x8_t s_1_5 = vreinterpretq_s16_u16(vaddl_u8(s[1], s[5]));
+ int16x8_t sum = vmulq_n_s16(s_0_6, filter_horizontal[0]);
+ sum = vmlaq_n_s16(sum, s_1_5, filter_horizontal[1]);
+ const int16x8_t a = HorizontalSum(s + 2, filter_horizontal + 2, sum);
+ vst1q_s16(wiener_buffer + x, a);
+ x += 8;
+ } while (x < width);
src += source_stride;
- a[3] = WienerHorizontal8Tap7(src, filter_horizontal);
- src += source_stride;
- a[4] = WienerHorizontal8Tap7(src, filter_horizontal);
- src += source_stride;
- a[5] = WienerHorizontal8Tap7(src, filter_horizontal);
- src += source_stride;
-
- int y = height;
+ } while (--y != 0);
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer + width, wiener_buffer,
+ sizeof(*wiener_buffer) * width);
+ wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer);
+ memcpy(wiener_buffer, wiener_buffer + width,
+ sizeof(*wiener_buffer) * width);
+
+ y = height;
+ do {
+ int x = 0;
do {
- a[6] = WienerHorizontal8Tap7(src, filter_horizontal);
- src += source_stride;
-
- const uint8x8_t r = WienerVerticalTap7(a, filter_vertical);
- vst1_u8(dst, r);
- dst += dest_stride;
-
- a[0] = a[1];
- a[1] = a[2];
- a[2] = a[3];
- a[3] = a[4];
- a[4] = a[5];
- a[5] = a[6];
- } while (--y != 0);
- } else {
- int y = height;
-
- if ((y & 1) != 0) {
- --y;
- a[0] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal);
- src += source_stride;
- a[2] = WienerHorizontal4x2Tap7(src + source_stride, source_stride,
- filter_horizontal);
- a[4] = WienerHorizontal4x2Tap7(src + 3 * source_stride, source_stride,
- filter_horizontal);
- a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2]));
- a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4]));
- a[6] =
- WienerHorizontal4Tap7(src + 5 * source_stride, filter_horizontal);
- a[5] = vcombine_s16(vget_high_s16(a[4]), vget_low_s16(a[6]));
- const uint8x8_t r = WienerVerticalTap7(a, filter_vertical);
- StoreLo4(dst, r);
- dst += dest_stride;
- }
-
- if (y != 0) {
- a[0] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal);
- src += 2 * source_stride;
- a[2] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal);
- src += 2 * source_stride;
- a[4] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal);
- src += 2 * source_stride;
- a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2]));
- a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4]));
-
- do {
- a[6] = WienerHorizontal4x2Tap7(src, source_stride, filter_horizontal);
- src += 2 * source_stride;
- a[5] = vcombine_s16(vget_high_s16(a[4]), vget_low_s16(a[6]));
-
- const uint8x8_t r = WienerVerticalTap7(a, filter_vertical);
- StoreLo4(dst, r);
- dst += dest_stride;
- StoreHi4(dst, r);
- dst += dest_stride;
-
- a[0] = a[2];
- a[1] = a[3];
- a[2] = a[4];
- a[3] = a[5];
- a[4] = a[6];
- y -= 2;
- } while (y != 0);
- }
- }
+ int16x8_t a[7];
+ a[0] = vld1q_s16(wiener_buffer + x + 0 * width);
+ a[1] = vld1q_s16(wiener_buffer + x + 1 * width);
+ a[2] = vld1q_s16(wiener_buffer + x + 2 * width);
+ a[3] = vld1q_s16(wiener_buffer + x + 3 * width);
+ a[4] = vld1q_s16(wiener_buffer + x + 4 * width);
+ a[5] = vld1q_s16(wiener_buffer + x + 5 * width);
+ a[6] = vld1q_s16(wiener_buffer + x + 6 * width);
+ const int16x8_t a_0_6 = vaddq_s16(a[0], a[6]);
+ const int16x8_t a_1_5 = vaddq_s16(a[1], a[5]);
+ int32x4_t sum[2];
+ sum[0] = sum[1] = vdupq_n_s32(0);
+ sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_6), filter_vertical[0]);
+ sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_6), filter_vertical[0]);
+ sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_1_5), filter_vertical[1]);
+ sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_1_5), filter_vertical[1]);
+ const uint8x8_t r = WienerVertical(a + 2, filter_vertical + 2, sum);
+ vst1_u8(dst + x, r);
+ x += 8;
+ } while (x < width);
+ wiener_buffer += width;
+ dst += dest_stride;
+ } while (--y != 0);
} else if (number_zero_coefficients == 1) {
// 5-tap
src -= (kCenterTap - 1) * source_stride + kCenterTap - 1;
-
- if (width > 8) {
- int y = height + kSubPixelTaps - 4;
- do {
- int x = 0;
- do {
- const int16x8_t a =
- WienerHorizontal8Tap5(src + x, filter_horizontal + 1);
- vst1q_s16(wiener_buffer + x, a);
- x += 8;
- } while (x < width);
- src += source_stride;
- wiener_buffer += width;
- } while (--y != 0);
-
- wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer);
-
- y = height;
+ int y = height + kSubPixelTaps - 4;
+ do {
+ int x = 0;
do {
- int x = 0;
- do {
- a[0] = vld1q_s16(wiener_buffer + x + 0 * width);
- a[1] = vld1q_s16(wiener_buffer + x + 1 * width);
- a[2] = vld1q_s16(wiener_buffer + x + 2 * width);
- a[3] = vld1q_s16(wiener_buffer + x + 3 * width);
- a[4] = vld1q_s16(wiener_buffer + x + 4 * width);
-
- const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1);
- vst1_u8(dst + x, r);
- x += 8;
- } while (x < width);
- wiener_buffer += width;
- dst += dest_stride;
- } while (--y != 0);
- } else if (width > 4) {
- a[0] = WienerHorizontal8Tap5(src, filter_horizontal + 1);
- src += source_stride;
- a[1] = WienerHorizontal8Tap5(src, filter_horizontal + 1);
- src += source_stride;
- a[2] = WienerHorizontal8Tap5(src, filter_horizontal + 1);
- src += source_stride;
- a[3] = WienerHorizontal8Tap5(src, filter_horizontal + 1);
+ const uint8x16_t r = vld1q_u8(src + x);
+ uint8x8_t s[5];
+ s[0] = vget_low_u8(r);
+ s[1] = vext_u8(s[0], vget_high_u8(r), 1);
+ s[2] = vext_u8(s[0], vget_high_u8(r), 2);
+ s[3] = vext_u8(s[0], vget_high_u8(r), 3);
+ s[4] = vext_u8(s[0], vget_high_u8(r), 4);
+ const int16x8_t s_0_4 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[4]));
+ const int16x8_t sum = vmulq_n_s16(s_0_4, filter_horizontal[1]);
+ const int16x8_t a = HorizontalSum(s + 1, filter_horizontal + 2, sum);
+ vst1q_s16(wiener_buffer + x, a);
+ x += 8;
+ } while (x < width);
src += source_stride;
+ wiener_buffer += width;
+ } while (--y != 0);
- int y = height;
+ wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer);
+ y = height;
+ do {
+ int x = 0;
do {
- a[4] = WienerHorizontal8Tap5(src, filter_horizontal + 1);
- src += source_stride;
-
- const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1);
- vst1_u8(dst, r);
- dst += dest_stride;
-
- a[0] = a[1];
- a[1] = a[2];
- a[2] = a[3];
- a[3] = a[4];
- } while (--y != 0);
- } else {
- int y = height;
-
- if ((y & 1) != 0) {
- --y;
- a[0] =
- WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1);
- src += source_stride;
- a[2] = WienerHorizontal4x2Tap5(src + source_stride, source_stride,
- filter_horizontal + 1);
- a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2]));
- a[4] = WienerHorizontal4Tap5(src + 3 * source_stride,
- filter_horizontal + 1);
- a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4]));
- const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1);
- StoreLo4(dst, r);
- dst += dest_stride;
- }
-
- if (y != 0) {
- a[0] =
- WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1);
- src += 2 * source_stride;
- a[2] =
- WienerHorizontal4x2Tap5(src, source_stride, filter_horizontal + 1);
- src += 2 * source_stride;
- a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2]));
-
- do {
- a[4] = WienerHorizontal4x2Tap5(src, source_stride,
- filter_horizontal + 1);
- src += 2 * source_stride;
- a[3] = vcombine_s16(vget_high_s16(a[2]), vget_low_s16(a[4]));
-
- const uint8x8_t r = WienerVerticalTap5(a, filter_vertical + 1);
- StoreLo4(dst, r);
- dst += dest_stride;
- StoreHi4(dst, r);
- dst += dest_stride;
-
- a[0] = a[2];
- a[1] = a[3];
- a[2] = a[4];
- y -= 2;
- } while (y != 0);
- }
- }
+ int16x8_t a[5];
+ a[0] = vld1q_s16(wiener_buffer + x + 0 * width);
+ a[1] = vld1q_s16(wiener_buffer + x + 1 * width);
+ a[2] = vld1q_s16(wiener_buffer + x + 2 * width);
+ a[3] = vld1q_s16(wiener_buffer + x + 3 * width);
+ a[4] = vld1q_s16(wiener_buffer + x + 4 * width);
+ const int16x8_t a_0_4 = vaddq_s16(a[0], a[4]);
+ int32x4_t sum[2];
+ sum[0] = sum[1] = vdupq_n_s32(0);
+ sum[0] = vmlal_n_s16(sum[0], vget_low_s16(a_0_4), filter_vertical[1]);
+ sum[1] = vmlal_n_s16(sum[1], vget_high_s16(a_0_4), filter_vertical[1]);
+ const uint8x8_t r = WienerVertical(a + 1, filter_vertical + 2, sum);
+ vst1_u8(dst + x, r);
+ x += 8;
+ } while (x < width);
+ wiener_buffer += width;
+ dst += dest_stride;
+ } while (--y != 0);
} else {
// 3-tap
src -= (kCenterTap - 2) * source_stride + kCenterTap - 2;
-
- if (width > 8) {
- int y = height + kSubPixelTaps - 6;
- do {
- int x = 0;
- do {
- const int16x8_t a =
- WienerHorizontal8Tap3(src + x, filter_horizontal + 2);
- vst1q_s16(wiener_buffer + x, a);
- x += 8;
- } while (x < width);
- src += source_stride;
- wiener_buffer += width;
- } while (--y != 0);
-
- wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer);
-
- y = height;
+ int y = height + kSubPixelTaps - 6;
+ do {
+ int x = 0;
do {
- int x = 0;
- do {
- a[0] = vld1q_s16(wiener_buffer + x + 0 * width);
- a[1] = vld1q_s16(wiener_buffer + x + 1 * width);
- a[2] = vld1q_s16(wiener_buffer + x + 2 * width);
-
- const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2);
- vst1_u8(dst + x, r);
- x += 8;
- } while (x < width);
- wiener_buffer += width;
- dst += dest_stride;
- } while (--y != 0);
- } else if (width > 4) {
- a[0] = WienerHorizontal8Tap3(src, filter_horizontal + 2);
- src += source_stride;
- a[1] = WienerHorizontal8Tap3(src, filter_horizontal + 2);
+ const uint8x16_t r = vld1q_u8(src + x);
+ uint8x8_t s[3];
+ s[0] = vget_low_u8(r);
+ s[1] = vext_u8(s[0], vget_high_u8(r), 1);
+ s[2] = vext_u8(s[0], vget_high_u8(r), 2);
+ const int16x8_t a =
+ HorizontalSum(s, filter_horizontal + 2, vdupq_n_s16(0));
+ vst1q_s16(wiener_buffer + x, a);
+ x += 8;
+ } while (x < width);
src += source_stride;
+ wiener_buffer += width;
+ } while (--y != 0);
- int y = height;
+ wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer);
+ y = height;
+ do {
+ int x = 0;
do {
- a[2] = WienerHorizontal8Tap3(src, filter_horizontal + 2);
- src += source_stride;
-
- const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2);
- vst1_u8(dst, r);
- dst += dest_stride;
-
- a[0] = a[1];
- a[1] = a[2];
- } while (--y != 0);
- } else {
- int y = height;
-
- if ((y & 1) != 0) {
- --y;
- a[0] =
- WienerHorizontal4x2Tap3(src, source_stride, filter_horizontal + 2);
- src += source_stride;
- a[2] =
- WienerHorizontal4Tap3(src + source_stride, filter_horizontal + 2);
- a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2]));
- const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2);
- StoreLo4(dst, r);
- dst += dest_stride;
- }
-
- if (y != 0) {
- a[0] =
- WienerHorizontal4x2Tap3(src, source_stride, filter_horizontal + 2);
- src += 2 * source_stride;
-
- do {
- a[2] = WienerHorizontal4x2Tap3(src, source_stride,
- filter_horizontal + 2);
- src += 2 * source_stride;
- a[1] = vcombine_s16(vget_high_s16(a[0]), vget_low_s16(a[2]));
-
- const uint8x8_t r = WienerVerticalTap3(a, filter_vertical + 2);
- StoreLo4(dst, r);
- dst += dest_stride;
- StoreHi4(dst, r);
- dst += dest_stride;
-
- a[0] = a[2];
- y -= 2;
- } while (y != 0);
- }
- }
+ int16x8_t a[3];
+ a[0] = vld1q_s16(wiener_buffer + x + 0 * width);
+ a[1] = vld1q_s16(wiener_buffer + x + 1 * width);
+ a[2] = vld1q_s16(wiener_buffer + x + 2 * width);
+ int32x4_t sum[2];
+ sum[0] = sum[1] = vdupq_n_s32(0);
+ const uint8x8_t r = WienerVertical(a, filter_vertical + 2, sum);
+ vst1_u8(dst + x, r);
+ x += 8;
+ } while (x < width);
+ wiener_buffer += width;
+ dst += dest_stride;
+ } while (--y != 0);
}
}
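The Wiener loops above rely on the filter being symmetric: rows (or pixels)
that share a tap are added first, so each tap costs a single widening
multiply-accumulate. A minimal scalar sketch of the 7-tap vertical sum; the
function and array names are illustrative only, not part of this patch.

// Scalar model of the accumulation built with vaddq_s16() + vmlal_n_s16()
// above. Pre-adding the symmetric pairs halves the number of multiplies.
inline int32_t WienerVertical7TapScalar(const int16_t rows[7],
                                        const int16_t filter[4]) {
  int32_t sum = filter[0] * (rows[0] + rows[6]);
  sum += filter[1] * (rows[1] + rows[5]);
  sum += filter[2] * (rows[2] + rows[4]);
  sum += filter[3] * rows[3];
  return sum;  // Still needs the rounding shift and the clamp to 8 bits.
}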
+//------------------------------------------------------------------------------
// SGR
-constexpr int kSgrProjScaleBits = 20;
-constexpr int kSgrProjRestoreBits = 4;
-constexpr int kSgrProjSgrBits = 8;
-constexpr int kSgrProjReciprocalBits = 12;
-
-// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
-// sgr_ma2 = 256 - a2
-constexpr uint8_t kSgrMa2Lookup[256] = {
- 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
- 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7,
- 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
- 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 0};
-
template <int n>
inline uint16x4_t CalculateSgrMA2(const uint32x4_t sum_sq, const uint16x4_t sum,
const uint32_t s) {
@@ -697,15 +317,15 @@ inline uint16x4_t CalculateSgrMA2(const uint32x4_t sum_sq, const uint16x4_t sum,
// z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
const uint32x4_t pxs = vmulq_n_u32(p, s);
- // For some reason vrshrn_n_u32() (narrowing shift) can only shift by 16
- // and kSgrProjScaleBits is 20.
+ // vrshrn_n_u32() (narrowing shift) can shift by at most 16, and
+ // kSgrProjScaleBits is 20, so the shift and the narrowing are done separately.
const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
return vmovn_u32(shifted);
}
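For reference, the quantity this helper feeds into the kSgrMa2Lookup table can
be written out as a scalar worked example (constants from the comments above):

// With kSgrProjScaleBits == 20 and kSgrProjSgrBits == 8:
//   z       = RightShiftWithRounding(p * s, 20), clamped to [0, 255]
//   a2      = ((z << 8) + (z >> 1)) / (z + 1)
//   sgr_ma2 = 256 - a2, which is the value stored in kSgrMa2Lookup[z].
// e.g. z = 4: a2 = (1024 + 2) / 5 = 205, so sgr_ma2 = 51, matching the table.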
-inline uint16x4_t CalculateB2Shifted(const uint8x8_t sgr_ma2,
- const uint16x4_t sum,
- const uint32_t one_over_n) {
+inline uint16x4_t CalculateIntermediate4(const uint8x8_t sgr_ma2,
+ const uint16x4_t sum,
+ const uint32_t one_over_n) {
// b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n
// 1 << kSgrProjSgrBits = 256
// |a2| = [1, 256]
@@ -726,9 +346,9 @@ inline uint16x4_t CalculateB2Shifted(const uint8x8_t sgr_ma2,
return vrshrn_n_u32(b2, kSgrProjReciprocalBits);
}
-inline uint16x8_t CalculateB2Shifted(const uint8x8_t sgr_ma2,
- const uint16x8_t sum,
- const uint32_t one_over_n) {
+inline uint16x8_t CalculateIntermediate8(const uint8x8_t sgr_ma2,
+ const uint16x8_t sum,
+ const uint32_t one_over_n) {
// b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n
// 1 << kSgrProjSgrBits = 256
// |a2| = [1, 256]
@@ -753,41 +373,41 @@ inline uint16x8_t CalculateB2Shifted(const uint8x8_t sgr_ma2,
return vcombine_u16(b2_lo, b2_hi);
}
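Both CalculateIntermediate overloads evaluate the same expression at different
vector widths; as a worked equation (taken from the comments above):

//   b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n
//      = sgr_ma2 * b * one_over_n
// followed by RightShiftWithRounding(b2, kSgrProjReciprocalBits), which is the
// final vrshrn_n_u32(b2, kSgrProjReciprocalBits) in each overload.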
-inline uint16x8_t Sum3(const uint16x8_t left, const uint16x8_t middle,
- const uint16x8_t right) {
+inline uint16x4_t Sum3(const uint16x4_t left, const uint16x4_t middle,
+ const uint16x4_t right) {
+ const uint16x4_t sum = vadd_u16(left, middle);
+ return vadd_u16(sum, right);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t left, const uint16x8_t middle,
+ const uint16x8_t right) {
const uint16x8_t sum = vaddq_u16(left, middle);
return vaddq_u16(sum, right);
}
-inline uint32x4_t Sum3(const uint32x4_t left, const uint32x4_t middle,
- const uint32x4_t right) {
+inline uint32x4_t Sum3_32(const uint32x4_t left, const uint32x4_t middle,
+ const uint32x4_t right) {
const uint32x4_t sum = vaddq_u32(left, middle);
return vaddq_u32(sum, right);
}
-inline uint16x8_t Sum3W(const uint8x8_t left, const uint8x8_t middle,
- const uint8x8_t right) {
+inline uint16x8_t Sum3W_16(const uint8x8_t left, const uint8x8_t middle,
+ const uint8x8_t right) {
const uint16x8_t sum = vaddl_u8(left, middle);
return vaddw_u8(sum, right);
}
-inline uint32x4_t Sum3W(const uint16x4_t left, const uint16x4_t middle,
- const uint16x4_t right) {
- const uint32x4_t sum = vaddl_u16(left, middle);
- return vaddw_u16(sum, right);
-}
-
-inline uint16x4_t Sum3(const uint16x4_t left, const uint16x4_t middle,
- const uint16x4_t right) {
- const uint16x4_t sum = vadd_u16(left, middle);
- return vadd_u16(sum, right);
+inline uint16x8_t Sum3W_16(const uint8x8_t a[3]) {
+ return Sum3W_16(a[0], a[1], a[2]);
}
-inline uint16x8_t Sum3W(const uint8x8_t a[3]) {
- return Sum3W(a[0], a[1], a[2]);
+inline uint32x4_t Sum3W_32(const uint16x4_t left, const uint16x4_t middle,
+ const uint16x4_t right) {
+ const uint32x4_t sum = vaddl_u16(left, middle);
+ return vaddw_u16(sum, right);
}
-inline uint16x8x2_t Sum3W(const uint8x16_t a[3]) {
+inline uint16x8x2_t Sum3W_16x2(const uint8x16_t a[3]) {
const uint8x8_t low0 = vget_low_u8(a[0]);
const uint8x8_t low1 = vget_low_u8(a[1]);
const uint8x8_t low2 = vget_low_u8(a[2]);
@@ -795,8 +415,8 @@ inline uint16x8x2_t Sum3W(const uint8x16_t a[3]) {
const uint8x8_t high1 = vget_high_u8(a[1]);
const uint8x8_t high2 = vget_high_u8(a[2]);
uint16x8x2_t sum;
- sum.val[0] = Sum3W(low0, low1, low2);
- sum.val[1] = Sum3W(high0, high1, high2);
+ sum.val[0] = Sum3W_16(low0, low1, low2);
+ sum.val[1] = Sum3W_16(high0, high1, high2);
return sum;
}
@@ -808,32 +428,31 @@ inline uint32x4x2_t Sum3W(const uint16x8_t a[3]) {
const uint16x4_t high1 = vget_high_u16(a[1]);
const uint16x4_t high2 = vget_high_u16(a[2]);
uint32x4x2_t sum;
- sum.val[0] = Sum3W(low0, low1, low2);
- sum.val[1] = Sum3W(high0, high1, high2);
+ sum.val[0] = Sum3W_32(low0, low1, low2);
+ sum.val[1] = Sum3W_32(high0, high1, high2);
return sum;
}
template <int index>
-inline uint32x4_t Sum3WLow(const uint16x8x2_t a[3]) {
+inline uint32x4_t Sum3WLo(const uint16x8x2_t a[3]) {
const uint16x4_t low0 = vget_low_u16(a[0].val[index]);
const uint16x4_t low1 = vget_low_u16(a[1].val[index]);
const uint16x4_t low2 = vget_low_u16(a[2].val[index]);
- return Sum3W(low0, low1, low2);
+ return Sum3W_32(low0, low1, low2);
}
-template <int index>
-inline uint32x4_t Sum3WHigh(const uint16x8x2_t a[3]) {
- const uint16x4_t high0 = vget_high_u16(a[0].val[index]);
- const uint16x4_t high1 = vget_high_u16(a[1].val[index]);
- const uint16x4_t high2 = vget_high_u16(a[2].val[index]);
- return Sum3W(high0, high1, high2);
+inline uint32x4_t Sum3WHi(const uint16x8x2_t a[3]) {
+ const uint16x4_t high0 = vget_high_u16(a[0].val[0]);
+ const uint16x4_t high1 = vget_high_u16(a[1].val[0]);
+ const uint16x4_t high2 = vget_high_u16(a[2].val[0]);
+ return Sum3W_32(high0, high1, high2);
}
inline uint32x4x3_t Sum3W(const uint16x8x2_t a[3]) {
uint32x4x3_t sum;
- sum.val[0] = Sum3WLow<0>(a);
- sum.val[1] = Sum3WHigh<0>(a);
- sum.val[2] = Sum3WLow<1>(a);
+ sum.val[0] = Sum3WLo<0>(a);
+ sum.val[1] = Sum3WHi(a);
+ sum.val[2] = Sum3WLo<1>(a);
return sum;
}
@@ -844,35 +463,35 @@ inline uint16x4_t Sum5(const uint16x4_t a[5]) {
return vadd_u16(sum, a[4]);
}
-inline uint16x8_t Sum5(const uint16x8_t a[5]) {
+inline uint16x8_t Sum5_16(const uint16x8_t a[5]) {
const uint16x8_t sum01 = vaddq_u16(a[0], a[1]);
const uint16x8_t sum23 = vaddq_u16(a[2], a[3]);
const uint16x8_t sum = vaddq_u16(sum01, sum23);
return vaddq_u16(sum, a[4]);
}
-inline uint32x4_t Sum5(const uint32x4_t a[5]) {
+inline uint32x4_t Sum5_32(const uint32x4_t a[5]) {
const uint32x4_t sum01 = vaddq_u32(a[0], a[1]);
const uint32x4_t sum23 = vaddq_u32(a[2], a[3]);
const uint32x4_t sum = vaddq_u32(sum01, sum23);
return vaddq_u32(sum, a[4]);
}
-inline uint16x8_t Sum5W(const uint8x8_t a[5]) {
+inline uint16x8_t Sum5W_16(const uint8x8_t a[5]) {
const uint16x8_t sum01 = vaddl_u8(a[0], a[1]);
const uint16x8_t sum23 = vaddl_u8(a[2], a[3]);
const uint16x8_t sum = vaddq_u16(sum01, sum23);
return vaddw_u8(sum, a[4]);
}
-inline uint32x4_t Sum5W(const uint16x4_t a[5]) {
+inline uint32x4_t Sum5W_32(const uint16x4_t a[5]) {
const uint32x4_t sum01 = vaddl_u16(a[0], a[1]);
const uint32x4_t sum23 = vaddl_u16(a[2], a[3]);
const uint32x4_t sum0123 = vaddq_u32(sum01, sum23);
return vaddw_u16(sum0123, a[4]);
}
-inline uint16x8x2_t Sum5W(const uint8x16_t a[5]) {
+inline uint16x8x2_t Sum5W_16D(const uint8x16_t a[5]) {
uint16x8x2_t sum;
uint8x8_t low[5], high[5];
low[0] = vget_low_u8(a[0]);
@@ -885,12 +504,12 @@ inline uint16x8x2_t Sum5W(const uint8x16_t a[5]) {
high[2] = vget_high_u8(a[2]);
high[3] = vget_high_u8(a[3]);
high[4] = vget_high_u8(a[4]);
- sum.val[0] = Sum5W(low);
- sum.val[1] = Sum5W(high);
+ sum.val[0] = Sum5W_16(low);
+ sum.val[1] = Sum5W_16(high);
return sum;
}
-inline uint32x4x2_t Sum5W(const uint16x8_t a[5]) {
+inline uint32x4x2_t Sum5W_32x2(const uint16x8_t a[5]) {
uint32x4x2_t sum;
uint16x4_t low[5], high[5];
low[0] = vget_low_u16(a[0]);
@@ -903,113 +522,112 @@ inline uint32x4x2_t Sum5W(const uint16x8_t a[5]) {
high[2] = vget_high_u16(a[2]);
high[3] = vget_high_u16(a[3]);
high[4] = vget_high_u16(a[4]);
- sum.val[0] = Sum5W(low);
- sum.val[1] = Sum5W(high);
+ sum.val[0] = Sum5W_32(low);
+ sum.val[1] = Sum5W_32(high);
return sum;
}
template <int index>
-inline uint32x4_t Sum5WLow(const uint16x8x2_t a[5]) {
+inline uint32x4_t Sum5WLo(const uint16x8x2_t a[5]) {
uint16x4_t low[5];
low[0] = vget_low_u16(a[0].val[index]);
low[1] = vget_low_u16(a[1].val[index]);
low[2] = vget_low_u16(a[2].val[index]);
low[3] = vget_low_u16(a[3].val[index]);
low[4] = vget_low_u16(a[4].val[index]);
- return Sum5W(low);
+ return Sum5W_32(low);
}
-template <int index>
-inline uint32x4_t Sum5WHigh(const uint16x8x2_t a[5]) {
+inline uint32x4_t Sum5WHi(const uint16x8x2_t a[5]) {
uint16x4_t high[5];
- high[0] = vget_high_u16(a[0].val[index]);
- high[1] = vget_high_u16(a[1].val[index]);
- high[2] = vget_high_u16(a[2].val[index]);
- high[3] = vget_high_u16(a[3].val[index]);
- high[4] = vget_high_u16(a[4].val[index]);
- return Sum5W(high);
+ high[0] = vget_high_u16(a[0].val[0]);
+ high[1] = vget_high_u16(a[1].val[0]);
+ high[2] = vget_high_u16(a[2].val[0]);
+ high[3] = vget_high_u16(a[3].val[0]);
+ high[4] = vget_high_u16(a[4].val[0]);
+ return Sum5W_32(high);
}
-inline uint32x4x3_t Sum5W(const uint16x8x2_t a[5]) {
+inline uint32x4x3_t Sum5W_32x3(const uint16x8x2_t a[5]) {
uint32x4x3_t sum;
- sum.val[0] = Sum5WLow<0>(a);
- sum.val[1] = Sum5WHigh<0>(a);
- sum.val[2] = Sum5WLow<1>(a);
+ sum.val[0] = Sum5WLo<0>(a);
+ sum.val[1] = Sum5WHi(a);
+ sum.val[2] = Sum5WLo<1>(a);
return sum;
}
inline uint16x4_t Sum3Horizontal(const uint16x8_t a) {
const uint16x4_t left = vget_low_u16(a);
- const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 1);
- const uint16x4_t right = vext_u16(vget_low_u16(a), vget_high_u16(a), 2);
+ const uint16x4_t middle = VshrU128<2>(a);
+ const uint16x4_t right = VshrU128<4>(a);
return Sum3(left, middle, right);
}
-inline uint16x8_t Sum3Horizontal(const uint16x8x2_t a) {
+inline uint16x8_t Sum3Horizontal_16(const uint16x8x2_t a) {
const uint16x8_t left = a.val[0];
const uint16x8_t middle = vextq_u16(a.val[0], a.val[1], 1);
const uint16x8_t right = vextq_u16(a.val[0], a.val[1], 2);
- return Sum3(left, middle, right);
+ return Sum3_16(left, middle, right);
}
-inline uint32x4_t Sum3Horizontal(const uint32x4x2_t a) {
+inline uint32x4_t Sum3Horizontal_32(const uint32x4x2_t a) {
const uint32x4_t left = a.val[0];
const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 1);
const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 2);
- return Sum3(left, middle, right);
+ return Sum3_32(left, middle, right);
}
-inline uint32x4x2_t Sum3Horizontal(const uint32x4x3_t a) {
+inline uint32x4x2_t Sum3Horizontal_32x2(const uint32x4x3_t a) {
uint32x4x2_t sum;
{
const uint32x4_t left = a.val[0];
const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 1);
const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 2);
- sum.val[0] = Sum3(left, middle, right);
+ sum.val[0] = Sum3_32(left, middle, right);
}
{
const uint32x4_t left = a.val[1];
const uint32x4_t middle = vextq_u32(a.val[1], a.val[2], 1);
const uint32x4_t right = vextq_u32(a.val[1], a.val[2], 2);
- sum.val[1] = Sum3(left, middle, right);
+ sum.val[1] = Sum3_32(left, middle, right);
}
return sum;
}
inline uint16x4_t Sum3HorizontalOffset1(const uint16x8_t a) {
- const uint16x4_t left = vext_u16(vget_low_u16(a), vget_high_u16(a), 1);
- const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 2);
- const uint16x4_t right = vext_u16(vget_low_u16(a), vget_high_u16(a), 3);
+ const uint16x4_t left = VshrU128<2>(a);
+ const uint16x4_t middle = VshrU128<4>(a);
+ const uint16x4_t right = VshrU128<6>(a);
return Sum3(left, middle, right);
}
-inline uint16x8_t Sum3HorizontalOffset1(const uint16x8x2_t a) {
+inline uint16x8_t Sum3HorizontalOffset1_16(const uint16x8x2_t a) {
const uint16x8_t left = vextq_u16(a.val[0], a.val[1], 1);
const uint16x8_t middle = vextq_u16(a.val[0], a.val[1], 2);
const uint16x8_t right = vextq_u16(a.val[0], a.val[1], 3);
- return Sum3(left, middle, right);
+ return Sum3_16(left, middle, right);
}
-inline uint32x4_t Sum3HorizontalOffset1(const uint32x4x2_t a) {
+inline uint32x4_t Sum3HorizontalOffset1_32(const uint32x4x2_t a) {
const uint32x4_t left = vextq_u32(a.val[0], a.val[1], 1);
const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 2);
const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 3);
- return Sum3(left, middle, right);
+ return Sum3_32(left, middle, right);
}
-inline uint32x4x2_t Sum3HorizontalOffset1(const uint32x4x3_t a) {
+inline uint32x4x2_t Sum3HorizontalOffset1_32x2(const uint32x4x3_t a) {
uint32x4x2_t sum;
{
const uint32x4_t left = vextq_u32(a.val[0], a.val[1], 1);
const uint32x4_t middle = vextq_u32(a.val[0], a.val[1], 2);
const uint32x4_t right = vextq_u32(a.val[0], a.val[1], 3);
- sum.val[0] = Sum3(left, middle, right);
+ sum.val[0] = Sum3_32(left, middle, right);
}
{
const uint32x4_t left = vextq_u32(a.val[1], a.val[2], 1);
const uint32x4_t middle = vextq_u32(a.val[1], a.val[2], 2);
const uint32x4_t right = vextq_u32(a.val[1], a.val[2], 3);
- sum.val[1] = Sum3(left, middle, right);
+ sum.val[1] = Sum3_32(left, middle, right);
}
return sum;
}
@@ -1017,34 +635,34 @@ inline uint32x4x2_t Sum3HorizontalOffset1(const uint32x4x3_t a) {
inline uint16x4_t Sum5Horizontal(const uint16x8_t a) {
uint16x4_t s[5];
s[0] = vget_low_u16(a);
- s[1] = vext_u16(vget_low_u16(a), vget_high_u16(a), 1);
- s[2] = vext_u16(vget_low_u16(a), vget_high_u16(a), 2);
- s[3] = vext_u16(vget_low_u16(a), vget_high_u16(a), 3);
+ s[1] = VshrU128<2>(a);
+ s[2] = VshrU128<4>(a);
+ s[3] = VshrU128<6>(a);
s[4] = vget_high_u16(a);
return Sum5(s);
}
-inline uint16x8_t Sum5Horizontal(const uint16x8x2_t a) {
+inline uint16x8_t Sum5Horizontal_16(const uint16x8x2_t a) {
uint16x8_t s[5];
s[0] = a.val[0];
s[1] = vextq_u16(a.val[0], a.val[1], 1);
s[2] = vextq_u16(a.val[0], a.val[1], 2);
s[3] = vextq_u16(a.val[0], a.val[1], 3);
- s[4] = vcombine_u16(vget_high_u16(a.val[0]), vget_low_u16(a.val[1]));
- return Sum5(s);
+ s[4] = vextq_u16(a.val[0], a.val[1], 4);
+ return Sum5_16(s);
}
-inline uint32x4_t Sum5Horizontal(const uint32x4x2_t a) {
+inline uint32x4_t Sum5Horizontal_32(const uint32x4x2_t a) {
uint32x4_t s[5];
s[0] = a.val[0];
s[1] = vextq_u32(a.val[0], a.val[1], 1);
s[2] = vextq_u32(a.val[0], a.val[1], 2);
s[3] = vextq_u32(a.val[0], a.val[1], 3);
s[4] = a.val[1];
- return Sum5(s);
+ return Sum5_32(s);
}
-inline uint32x4x2_t Sum5Horizontal(const uint32x4x3_t a) {
+inline uint32x4x2_t Sum5Horizontal_32x2(const uint32x4x3_t a) {
uint32x4x2_t sum;
uint32x4_t s[5];
s[0] = a.val[0];
@@ -1052,43 +670,42 @@ inline uint32x4x2_t Sum5Horizontal(const uint32x4x3_t a) {
s[2] = vextq_u32(a.val[0], a.val[1], 2);
s[3] = vextq_u32(a.val[0], a.val[1], 3);
s[4] = a.val[1];
- sum.val[0] = Sum5(s);
+ sum.val[0] = Sum5_32(s);
s[0] = a.val[1];
s[1] = vextq_u32(a.val[1], a.val[2], 1);
s[2] = vextq_u32(a.val[1], a.val[2], 2);
s[3] = vextq_u32(a.val[1], a.val[2], 3);
s[4] = a.val[2];
- sum.val[1] = Sum5(s);
+ sum.val[1] = Sum5_32(s);
return sum;
}
template <int size, int offset>
-inline void PreProcess4(const uint8x8_t* const row,
- const uint16x8_t* const row_sq, const uint32_t s,
- uint16_t* const dst) {
+inline void BoxFilterPreProcess4(const uint8x8_t* const row,
+ const uint16x8_t* const row_sq,
+ const uint32_t s, uint16_t* const dst) {
static_assert(offset == 0 || offset == 1, "");
// Number of elements in the box being summed.
constexpr uint32_t n = size * size;
constexpr uint32_t one_over_n =
((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
- const uint16x4_t v_255 = vdup_n_u16(255);
uint16x4_t sum;
uint32x4_t sum_sq;
if (size == 3) {
if (offset == 0) {
- sum = Sum3Horizontal(Sum3W(row));
- sum_sq = Sum3Horizontal(Sum3W(row_sq));
+ sum = Sum3Horizontal(Sum3W_16(row));
+ sum_sq = Sum3Horizontal_32(Sum3W(row_sq));
} else {
- sum = Sum3HorizontalOffset1(Sum3W(row));
- sum_sq = Sum3HorizontalOffset1(Sum3W(row_sq));
+ sum = Sum3HorizontalOffset1(Sum3W_16(row));
+ sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq));
}
}
if (size == 5) {
- sum = Sum5Horizontal(Sum5W(row));
- sum_sq = Sum5Horizontal(Sum5W(row_sq));
+ sum = Sum5Horizontal(Sum5W_16(row));
+ sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq));
}
const uint16x4_t z0 = CalculateSgrMA2<n>(sum_sq, sum, s);
- const uint16x4_t z = vmin_u16(v_255, z0);
+ const uint16x4_t z = vmin_u16(z0, vdup_n_u16(255));
// Using vget_lane_s16() can save a sign extension instruction.
// Add 4 0s for memory initialization purposes only.
const uint8_t lookup[8] = {
@@ -1101,42 +718,41 @@ inline void PreProcess4(const uint8x8_t* const row,
kSgrMa2Lookup[vget_lane_s16(vreinterpret_s16_u16(z), 2)],
kSgrMa2Lookup[vget_lane_s16(vreinterpret_s16_u16(z), 3)]};
const uint8x8_t sgr_ma2 = vld1_u8(lookup);
- const uint16x4_t b2 = CalculateB2Shifted(sgr_ma2, sum, one_over_n);
+ const uint16x4_t b2 = CalculateIntermediate4(sgr_ma2, sum, one_over_n);
const uint16x8_t sgr_ma2_b2 = vcombine_u16(vreinterpret_u16_u8(sgr_ma2), b2);
vst1q_u16(dst, sgr_ma2_b2);
}
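Since n is a template-derived constant, one_over_n folds at compile time; the
worked values for the two box sizes used here are:

//   one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
//   with kSgrProjReciprocalBits == 12:
//     3x3 box: n = 9,  one_over_n = (4096 + 4)  / 9  = 455
//     5x5 box: n = 25, one_over_n = (4096 + 12) / 25 = 164
//   Multiplying by one_over_n and shifting right by 12 replaces the division
//   by n, which has no direct NEON equivalent.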
template <int size, int offset>
-inline void PreProcess8(const uint8x16_t* const row,
- const uint16x8x2_t* const row_sq, const uint32_t s,
- uint8x8_t* const sgr_ma2, uint16x8_t* const b2,
- uint16_t* const dst) {
+inline void BoxFilterPreProcess8(const uint8x16_t* const row,
+ const uint16x8x2_t* const row_sq,
+ const uint32_t s, uint8x8_t* const sgr_ma2,
+ uint16x8_t* const b2, uint16_t* const dst) {
// Number of elements in the box being summed.
constexpr uint32_t n = size * size;
constexpr uint32_t one_over_n =
((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
- const uint16x8_t v_255 = vdupq_n_u16(255);
uint16x8_t sum;
uint32x4x2_t sum_sq;
if (size == 3) {
if (offset == 0) {
- sum = Sum3Horizontal(Sum3W(row));
- sum_sq = Sum3Horizontal(Sum3W(row_sq));
+ sum = Sum3Horizontal_16(Sum3W_16x2(row));
+ sum_sq = Sum3Horizontal_32x2(Sum3W(row_sq));
} else /* if (offset == 1) */ {
- sum = Sum3HorizontalOffset1(Sum3W(row));
- sum_sq = Sum3HorizontalOffset1(Sum3W(row_sq));
+ sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row));
+ sum_sq = Sum3HorizontalOffset1_32x2(Sum3W(row_sq));
}
}
if (size == 5) {
- sum = Sum5Horizontal(Sum5W(row));
- sum_sq = Sum5Horizontal(Sum5W(row_sq));
+ sum = Sum5Horizontal_16(Sum5W_16D(row));
+ sum_sq = Sum5Horizontal_32x2(Sum5W_32x3(row_sq));
}
const uint16x4_t z0 = CalculateSgrMA2<n>(sum_sq.val[0], vget_low_u16(sum), s);
const uint16x4_t z1 =
CalculateSgrMA2<n>(sum_sq.val[1], vget_high_u16(sum), s);
const uint16x8_t z01 = vcombine_u16(z0, z1);
// Using vqmovn_u16() needs an extra sign extension instruction.
- const uint16x8_t z = vminq_u16(v_255, z01);
+ const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
// Using vgetq_lane_s16() can save the sign extension instruction.
const uint8_t lookup[8] = {
kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
@@ -1148,40 +764,40 @@ inline void PreProcess8(const uint8x16_t* const row,
kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
kSgrMa2Lookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
*sgr_ma2 = vld1_u8(lookup);
- *b2 = CalculateB2Shifted(*sgr_ma2, sum, one_over_n);
+ *b2 = CalculateIntermediate8(*sgr_ma2, sum, one_over_n);
const uint16x8_t sgr_ma2_b2 =
vcombine_u16(vreinterpret_u16_u8(*sgr_ma2), vget_high_u16(*b2));
vst1q_u16(dst, sgr_ma2_b2);
}
-inline void Prepare3(const uint8x8_t a[2], uint8x8_t* const left,
- uint8x8_t* const middle, uint8x8_t* const right) {
+inline void Prepare3_8(const uint8x8_t a[2], uint8x8_t* const left,
+ uint8x8_t* const middle, uint8x8_t* const right) {
*left = vext_u8(a[0], a[1], 4);
*middle = vext_u8(a[0], a[1], 5);
*right = vext_u8(a[0], a[1], 6);
}
-inline void Prepare3(const uint16x8_t a[2], uint16x8_t* const left,
- uint16x8_t* const middle, uint16x8_t* const right) {
- *left = vcombine_u16(vget_high_u16(a[0]), vget_low_u16(a[1]));
+inline void Prepare3_16(const uint16x8_t a[2], uint16x8_t* const left,
+ uint16x8_t* const middle, uint16x8_t* const right) {
+ *left = vextq_u16(a[0], a[1], 4);
*middle = vextq_u16(a[0], a[1], 5);
*right = vextq_u16(a[0], a[1], 6);
}
inline uint16x8_t Sum343(const uint8x8_t a[2]) {
uint8x8_t left, middle, right;
- Prepare3(a, &left, &middle, &right);
- const uint16x8_t sum = Sum3W(left, middle, right);
- const uint16x8_t sum3 = Sum3(sum, sum, sum);
+ Prepare3_8(a, &left, &middle, &right);
+ const uint16x8_t sum = Sum3W_16(left, middle, right);
+ const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
return vaddw_u8(sum3, middle);
}
inline void Sum343_444(const uint8x8_t a[2], uint16x8_t* const sum343,
uint16x8_t* const sum444) {
uint8x8_t left, middle, right;
- Prepare3(a, &left, &middle, &right);
- const uint16x8_t sum = Sum3W(left, middle, right);
- const uint16x8_t sum3 = Sum3(sum, sum, sum);
+ Prepare3_8(a, &left, &middle, &right);
+ const uint16x8_t sum = Sum3W_16(left, middle, right);
+ const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
*sum343 = vaddw_u8(sum3, middle);
*sum444 = vshlq_n_u16(sum, 2);
}
@@ -1189,13 +805,13 @@ inline void Sum343_444(const uint8x8_t a[2], uint16x8_t* const sum343,
inline uint32x4x2_t Sum343W(const uint16x8_t a[2]) {
uint16x8_t left, middle, right;
uint32x4x2_t d;
- Prepare3(a, &left, &middle, &right);
+ Prepare3_16(a, &left, &middle, &right);
d.val[0] =
- Sum3W(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right));
- d.val[1] =
- Sum3W(vget_high_u16(left), vget_high_u16(middle), vget_high_u16(right));
- d.val[0] = Sum3(d.val[0], d.val[0], d.val[0]);
- d.val[1] = Sum3(d.val[1], d.val[1], d.val[1]);
+ Sum3W_32(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right));
+ d.val[1] = Sum3W_32(vget_high_u16(left), vget_high_u16(middle),
+ vget_high_u16(right));
+ d.val[0] = Sum3_32(d.val[0], d.val[0], d.val[0]);
+ d.val[1] = Sum3_32(d.val[1], d.val[1], d.val[1]);
d.val[0] = vaddw_u16(d.val[0], vget_low_u16(middle));
d.val[1] = vaddw_u16(d.val[1], vget_high_u16(middle));
return d;
@@ -1204,13 +820,13 @@ inline uint32x4x2_t Sum343W(const uint16x8_t a[2]) {
inline void Sum343_444W(const uint16x8_t a[2], uint32x4x2_t* const sum343,
uint32x4x2_t* const sum444) {
uint16x8_t left, middle, right;
- Prepare3(a, &left, &middle, &right);
+ Prepare3_16(a, &left, &middle, &right);
sum444->val[0] =
- Sum3W(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right));
- sum444->val[1] =
- Sum3W(vget_high_u16(left), vget_high_u16(middle), vget_high_u16(right));
- sum343->val[0] = Sum3(sum444->val[0], sum444->val[0], sum444->val[0]);
- sum343->val[1] = Sum3(sum444->val[1], sum444->val[1], sum444->val[1]);
+ Sum3W_32(vget_low_u16(left), vget_low_u16(middle), vget_low_u16(right));
+ sum444->val[1] = Sum3W_32(vget_high_u16(left), vget_high_u16(middle),
+ vget_high_u16(right));
+ sum343->val[0] = Sum3_32(sum444->val[0], sum444->val[0], sum444->val[0]);
+ sum343->val[1] = Sum3_32(sum444->val[1], sum444->val[1], sum444->val[1]);
sum343->val[0] = vaddw_u16(sum343->val[0], vget_low_u16(middle));
sum343->val[1] = vaddw_u16(sum343->val[1], vget_high_u16(middle));
sum444->val[0] = vshlq_n_u32(sum444->val[0], 2);
@@ -1219,8 +835,8 @@ inline void Sum343_444W(const uint16x8_t a[2], uint32x4x2_t* const sum343,
inline uint16x8_t Sum565(const uint8x8_t a[2]) {
uint8x8_t left, middle, right;
- Prepare3(a, &left, &middle, &right);
- const uint16x8_t sum = Sum3W(left, middle, right);
+ Prepare3_8(a, &left, &middle, &right);
+ const uint16x8_t sum = Sum3W_16(left, middle, right);
const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
const uint16x8_t sum5 = vaddq_u16(sum4, sum);
return vaddw_u8(sum5, middle);
@@ -1228,9 +844,9 @@ inline uint16x8_t Sum565(const uint8x8_t a[2]) {
inline uint32x4_t Sum565W(const uint16x8_t a) {
const uint16x4_t left = vget_low_u16(a);
- const uint16x4_t middle = vext_u16(left, vget_high_u16(a), 1);
- const uint16x4_t right = vext_u16(left, vget_high_u16(a), 2);
- const uint32x4_t sum = Sum3W(left, middle, right);
+ const uint16x4_t middle = VshrU128<2>(a);
+ const uint16x4_t right = VshrU128<4>(a);
+ const uint32x4_t sum = Sum3W_32(left, middle, right);
const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
const uint32x4_t sum5 = vaddq_u32(sum4, sum);
return vaddw_u16(sum5, middle);
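The digits in the Sum565/Sum343/Sum444 names are the weights applied to the
left/middle/right inputs; expanding the shifts above as a worked equation:

//   Sum565: 5 * (l + m + r) + m = 5*l + 6*m + 5*r
//   Sum343: 3 * (l + m + r) + m = 3*l + 4*m + 3*r
//   Sum444: 4 * (l + m + r)     = 4*l + 4*m + 4*r
// These are the row weightings used because the intermediate a2/b2 values are
// only computed on a subsampled grid of rows.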
@@ -1256,53 +872,95 @@ inline uint16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t a,
}
template <int shift>
-inline void CalculateFilteredOutput(const uint8x8_t src, const uint16x8_t a,
- const uint32x4x2_t b, uint16_t* const dst) {
+inline int16x8_t CalculateFilteredOutput(const uint8x8_t src,
+ const uint16x8_t a,
+ const uint32x4x2_t b) {
const uint16x8_t src_u16 = vmovl_u8(src);
const uint16x4_t dst_lo =
FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(a), b.val[0]);
const uint16x4_t dst_hi =
FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(a), b.val[1]);
- const uint16x8_t d = vcombine_u16(dst_lo, dst_hi);
- vst1q_u16(dst, d);
+ return vreinterpretq_s16_u16(vcombine_u16(dst_lo, dst_hi)); // 14 bits
}
-inline void BoxFilter1(const uint8x8_t src_u8, const uint8x8_t a2[2],
- const uint16x8_t b2[2], uint16x8_t sum565_a[2],
- uint32x4x2_t sum565_b[2], uint16_t* const out_buf) {
+inline int16x8_t BoxFilterPass1(const uint8x8_t src_u8, const uint8x8_t a2[2],
+ const uint16x8_t b2[2], uint16x8_t sum565_a[2],
+ uint32x4x2_t sum565_b[2]) {
uint32x4x2_t b_v;
sum565_a[1] = Sum565(a2);
sum565_a[1] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[1]);
- sum565_b[1].val[0] =
- Sum565W(vcombine_u16(vget_high_u16(b2[0]), vget_low_u16(b2[1])));
+ sum565_b[1].val[0] = Sum565W(vextq_u16(b2[0], b2[1], 4));
sum565_b[1].val[1] = Sum565W(b2[1]);
uint16x8_t a_v = vaddq_u16(sum565_a[0], sum565_a[1]);
b_v.val[0] = vaddq_u32(sum565_b[0].val[0], sum565_b[1].val[0]);
b_v.val[1] = vaddq_u32(sum565_b[0].val[1], sum565_b[1].val[1]);
- CalculateFilteredOutput<5>(src_u8, a_v, b_v, out_buf);
+ return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits
}
-inline void BoxFilter2(const uint8x8_t src_u8, const uint8x8_t a2[2],
- const uint16x8_t b2[2], uint16x8_t sum343_a[4],
- uint16x8_t sum444_a[3], uint32x4x2_t sum343_b[4],
- uint32x4x2_t sum444_b[3], uint16_t* const out_buf) {
+inline int16x8_t BoxFilterPass2(const uint8x8_t src_u8, const uint8x8_t a2[2],
+ const uint16x8_t b2[2], uint16x8_t sum343_a[4],
+ uint16x8_t sum444_a[3],
+ uint32x4x2_t sum343_b[4],
+ uint32x4x2_t sum444_b[3]) {
uint32x4x2_t b_v;
Sum343_444(a2, &sum343_a[2], &sum444_a[1]);
sum343_a[2] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[2]);
sum444_a[1] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[1]);
- uint16x8_t a_v = Sum3(sum343_a[0], sum444_a[0], sum343_a[2]);
+ uint16x8_t a_v = Sum3_16(sum343_a[0], sum444_a[0], sum343_a[2]);
Sum343_444W(b2, &sum343_b[2], &sum444_b[1]);
- b_v.val[0] = Sum3(sum343_b[0].val[0], sum444_b[0].val[0], sum343_b[2].val[0]);
- b_v.val[1] = Sum3(sum343_b[0].val[1], sum444_b[0].val[1], sum343_b[2].val[1]);
- CalculateFilteredOutput<5>(src_u8, a_v, b_v, out_buf);
+ b_v.val[0] =
+ Sum3_32(sum343_b[0].val[0], sum444_b[0].val[0], sum343_b[2].val[0]);
+ b_v.val[1] =
+ Sum3_32(sum343_b[0].val[1], sum444_b[0].val[1], sum343_b[2].val[1]);
+ return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits
+}
+
+inline void SelfGuidedDoubleMultiplier(
+ const uint8x8_t src, const int16x8_t box_filter_process_output[2],
+ const int16x4_t w0, const int16x4_t w1, const int16x4_t w2,
+ uint8_t* const dst) {
+ // |wN| values are signed. |src| values can be treated as int16_t.
+ const int16x8_t u =
+ vreinterpretq_s16_u16(vshll_n_u8(src, kSgrProjRestoreBits));
+ int32x4_t v_lo = vmull_s16(vget_low_s16(u), w1);
+ v_lo = vmlal_s16(v_lo, vget_low_s16(box_filter_process_output[0]), w0);
+ v_lo = vmlal_s16(v_lo, vget_low_s16(box_filter_process_output[1]), w2);
+ int32x4_t v_hi = vmull_s16(vget_high_s16(u), w1);
+ v_hi = vmlal_s16(v_hi, vget_high_s16(box_filter_process_output[0]), w0);
+ v_hi = vmlal_s16(v_hi, vget_high_s16(box_filter_process_output[1]), w2);
+ // |s| is saturated to uint8_t by the vqmovun_s16() below.
+ const int16x4_t s_lo =
+ vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x4_t s_hi =
+ vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ vst1_u8(dst, vqmovun_s16(vcombine_s16(s_lo, s_hi)));
}
-inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
+inline void SelfGuidedSingleMultiplier(
+ const uint8x8_t src, const int16x8_t box_filter_process_output,
+ const int16_t w0, const int16_t w1, uint8_t* dst) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ const int16x8_t u =
+ vreinterpretq_s16_u16(vshll_n_u8(src, kSgrProjRestoreBits));
+ // u * w1 + u * wN == u * (w1 + wN)
+ int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w1);
+ v_lo = vmlal_n_s16(v_lo, vget_low_s16(box_filter_process_output), w0);
+ int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w1);
+ v_hi = vmlal_n_s16(v_hi, vget_high_s16(box_filter_process_output), w0);
+ const int16x4_t s_lo =
+ vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x4_t s_hi =
+ vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ vst1_u8(dst, vqmovun_s16(vcombine_s16(s_lo, s_hi)));
+}
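Both SelfGuided*Multiplier helpers end with the same rounded shift and
unsigned-saturating narrow. A scalar sketch of the single-multiplier case;
kSgrProjRestoreBits is 4 (see the constants this file previously defined), and
kSgrProjPrecisionBits is assumed to be 7, the AV1 projection precision:

// Scalar model of SelfGuidedSingleMultiplier (illustrative only):
//   u   = src << kSgrProjRestoreBits;
//   v   = u * w1 + p * w0;  // p is the box filter pass output (14 bits).
//   dst = Clip8(RightShiftWithRounding(
//             v, kSgrProjRestoreBits + kSgrProjPrecisionBits));
// The NEON version does the multiply-accumulate in 32 bits (vmull_n_s16 /
// vmlal_n_s16) and the rounded shift plus clamp with vrshrn_n_s32() +
// vqmovun_s16().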
+
+inline void BoxFilterProcess(const uint8_t* const src,
+ const ptrdiff_t src_stride,
+ const RestorationUnitInfo& restoration_info,
const int width, const int height,
- const uint16_t s[2],
- uint16_t* const box_filter_process_output,
- uint16_t* const temp) {
+ const uint16_t s[2], uint16_t* const temp,
+ uint8_t* const dst, const ptrdiff_t dst_stride) {
// We have combined PreProcess and Process for the first pass by storing
// intermediate values in the |a2| region. The values stored are one vertical
// column of interleaved |a2| and |b2| values and consume 8 * |height| values.
@@ -1340,45 +998,39 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
// interleaved in |temp|. The first half is not stored, since it is used
// immediately and becomes useless for the next column. Next we will start the
// second column. When 2 rows have been calculated we can calculate Process
- // and output those into the top of |box_filter_process_output|.
+ // and output the results.
// Calculate and store a single column. Scope so we can re-use the variable
// names for the next step.
uint16_t* ab_ptr = temp;
- // The first phase needs a radius of 2 context values. The second phase
- // needs a context of radius 1 values. This means we start at (-3, -3).
- const uint8_t* const src_pre_process = src - 3 - 3 * stride;
- // Calculate intermediate results, including two-pixel border, for example,
- // if unit size is 64x64, we calculate 68x68 pixels.
+ const uint8_t* const src_pre_process = src - 2 * src_stride - 3;
+ // Calculate intermediate results, including two-pixel border, for example, if
+ // unit size is 64x64, we calculate 68x68 pixels.
{
const uint8_t* column = src_pre_process;
uint8x8_t row[5];
uint16x8_t row_sq[5];
-
- row[0] = vld1_u8(column);
- column += stride;
- row[1] = vld1_u8(column);
- column += stride;
+ row[0] = row[1] = vld1_u8(column);
+ column += src_stride;
row[2] = vld1_u8(column);
- row_sq[0] = vmull_u8(row[0], row[0]);
- row_sq[1] = vmull_u8(row[1], row[1]);
+ row_sq[0] = row_sq[1] = vmull_u8(row[1], row[1]);
row_sq[2] = vmull_u8(row[2], row[2]);
- int y = 0;
+ int y = (height + 2) >> 1;
do {
- column += stride;
+ column += src_stride;
row[3] = vld1_u8(column);
- column += stride;
+ column += src_stride;
row[4] = vld1_u8(column);
row_sq[3] = vmull_u8(row[3], row[3]);
row_sq[4] = vmull_u8(row[4], row[4]);
- PreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0);
- PreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8);
- PreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16);
+ BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0);
+ BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8);
+ BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16);
row[0] = row[2];
row[1] = row[3];
@@ -1388,10 +1040,23 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
row_sq[1] = row_sq[3];
row_sq[2] = row_sq[4];
ab_ptr += 24;
- y += 2;
- } while (y < height + 2);
+ } while (--y != 0);
+
+ if ((height & 1) != 0) {
+ column += src_stride;
+ row[3] = row[4] = vld1_u8(column);
+ row_sq[3] = row_sq[4] = vmull_u8(row[3], row[3]);
+ BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0);
+ BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8);
+ }
}
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ const int16x4_t w0_v = vdup_n_s16(w0);
+ const int16x4_t w1_v = vdup_n_s16(w1);
+ const int16x4_t w2_v = vdup_n_s16(w2);
int x = 0;
do {
// |src_pre_process| is X but we already processed the first column of 4
@@ -1423,21 +1088,18 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
const uint8_t* column = src_pre_process + x + 4;
uint8x16_t row[5];
uint16x8x2_t row_sq[5];
-
- row[0] = vld1q_u8(column);
- column += stride;
- row[1] = vld1q_u8(column);
- column += stride;
+ row[0] = row[1] = vld1q_u8(column);
+ column += src_stride;
row[2] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[3] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[4] = vld1q_u8(column);
- row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0]));
- row_sq[0].val[1] = vmull_u8(vget_high_u8(row[0]), vget_high_u8(row[0]));
- row_sq[1].val[0] = vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1]));
- row_sq[1].val[1] = vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1]));
+ row_sq[0].val[0] = row_sq[1].val[0] =
+ vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1]));
+ row_sq[0].val[1] = row_sq[1].val[1] =
+ vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1]));
row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2]));
row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2]));
row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3]));
@@ -1445,21 +1107,17 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4]));
row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4]));
- PreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], ab_ptr);
- PreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1],
- ab_ptr + 8);
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], ab_ptr);
+ BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1],
+ ab_ptr + 8);
// Pass 1 Process. These are the only values we need to propagate between
// rows.
sum565_a[0] = Sum565(a2[0]);
sum565_a[0] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[0]);
- sum565_b[0].val[0] =
- Sum565W(vcombine_u16(vget_high_u16(b2[0][0]), vget_low_u16(b2[0][1])));
+ sum565_b[0].val[0] = Sum565W(vextq_u16(b2[0][0], b2[0][1], 4));
sum565_b[0].val[1] = Sum565W(b2[0][1]);
- const uint8_t* src_ptr = src + x;
- uint16_t* out_buf = box_filter_process_output + 2 * x;
-
sum343_a[0] = Sum343(a2[1]);
sum343_a[0] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[0]);
sum343_b[0] = Sum343W(b2[1]);
@@ -1467,19 +1125,21 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
b2[1][0] = vld1q_u16(ab_ptr + 16);
a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0]));
- PreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1],
- ab_ptr + 16);
+ BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1],
+ ab_ptr + 16);
Sum343_444(a2[1], &sum343_a[1], &sum444_a[0]);
sum343_a[1] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[1]);
sum444_a[0] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[0]);
Sum343_444W(b2[1], &sum343_b[1], &sum444_b[0]);
+ const uint8_t* src_ptr = src + x;
+ uint8_t* dst_ptr = dst + x;
+
// Calculate one output line. Add in the line from the previous pass and
// output one even row. Sum the new line and output the odd row. Carry the
// new row into the next pass.
- int y = 0;
- do {
+ for (int y = height >> 1; y != 0; --y) {
ab_ptr += 24;
b2[0][0] = vld1q_u16(ab_ptr);
a2[0][0] = vget_low_u8(vreinterpretq_u8_u16(b2[0][0]));
@@ -1494,9 +1154,9 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
row_sq[1] = row_sq[3];
row_sq[2] = row_sq[4];
- column += stride;
+ column += src_stride;
row[3] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[4] = vld1q_u8(column);
row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3]));
@@ -1504,28 +1164,31 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4]));
row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4]));
- PreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1], ab_ptr);
- PreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1], &b2[1][1],
- ab_ptr + 8);
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1],
+ ab_ptr);
+ BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1],
+ &b2[1][1], ab_ptr + 8);
+
+ int16x8_t p[2];
+ const uint8x8_t src0 = vld1_u8(src_ptr);
+ p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b);
+ p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b,
+ sum444_b);
+ SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
- uint8x8_t src_u8 = vld1_u8(src_ptr);
- BoxFilter1(src_u8, a2[0], b2[0], sum565_a, sum565_b, out_buf);
- BoxFilter2(src_u8, a2[1], b2[1], sum343_a, sum444_a, sum343_b, sum444_b,
- out_buf + 8);
- src_ptr += stride;
- out_buf += 2 * kRestorationProcessingUnitSize;
-
- src_u8 = vld1_u8(src_ptr);
- CalculateFilteredOutput<4>(src_u8, sum565_a[1], sum565_b[1], out_buf);
+ const uint8x8_t src1 = vld1_u8(src_ptr);
+ p[0] = CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]);
b2[1][0] = vld1q_u16(ab_ptr + 16);
a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0]));
- PreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1], &b2[1][1],
- ab_ptr + 16);
-
- BoxFilter2(src_u8, a2[1], b2[1], sum343_a + 1, sum444_a + 1, sum343_b + 1,
- sum444_b + 1, out_buf + 8);
- src_ptr += stride;
- out_buf += 2 * kRestorationProcessingUnitSize;
+ BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1][1],
+ &b2[1][1], ab_ptr + 16);
+ p[1] = BoxFilterPass2(src1, a2[1], b2[1], sum343_a + 1, sum444_a + 1,
+ sum343_b + 1, sum444_b + 1);
+ SelfGuidedDoubleMultiplier(src1, p, w0_v, w1_v, w2_v, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
sum565_a[0] = sum565_a[1];
sum565_b[0] = sum565_b[1];
@@ -1535,17 +1198,53 @@ inline void BoxFilterProcess(const uint8_t* const src, const ptrdiff_t stride,
sum343_b[0] = sum343_b[2];
sum343_b[1] = sum343_b[3];
sum444_b[0] = sum444_b[2];
+ }
+ if ((height & 1) != 0) {
+ ab_ptr += 24;
+ b2[0][0] = vld1q_u16(ab_ptr);
+ a2[0][0] = vget_low_u8(vreinterpretq_u8_u16(b2[0][0]));
+ b2[1][0] = vld1q_u16(ab_ptr + 8);
+ a2[1][0] = vget_low_u8(vreinterpretq_u8_u16(b2[1][0]));
- y += 2;
- } while (y < height);
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0] = row_sq[2];
+ row_sq[1] = row_sq[3];
+ row_sq[2] = row_sq[4];
+
+ column += src_stride;
+ row[3] = row[4] = vld1q_u8(column);
+
+ row_sq[3].val[0] = row_sq[4].val[0] =
+ vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3]));
+ row_sq[3].val[1] = row_sq[4].val[1] =
+ vmull_u8(vget_high_u8(row[3]), vget_high_u8(row[3]));
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0][1], &b2[0][1],
+ ab_ptr);
+ BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1][1],
+ &b2[1][1], ab_ptr + 8);
+
+ int16x8_t p[2];
+ const uint8x8_t src0 = vld1_u8(src_ptr);
+ p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b);
+ p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b,
+ sum444_b);
+ SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr);
+ }
x += 8;
} while (x < width);
}
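The row structure of the loop above follows from the 565 weighting: the 5x5
preprocess is only evaluated on every other source row, and each evaluation
contributes to two output rows, so the main loop runs height >> 1 times with a
separate tail for odd heights.

//   even row: filter with sum565_a[0] + sum565_a[1] and shift 5
//             (BoxFilterPass1, both windows)
//   odd  row: filter with sum565_a[1] only and shift 4
//             (CalculateFilteredOutput<4>, the new window)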
-inline void BoxFilterProcess_FirstPass(
- const uint8_t* const src, const ptrdiff_t stride, const int width,
- const int height, const uint32_t s,
- uint16_t* const box_filter_process_output, uint16_t* const temp) {
+inline void BoxFilterProcessPass1(const uint8_t* const src,
+ const ptrdiff_t src_stride,
+ const RestorationUnitInfo& restoration_info,
+ const int width, const int height,
+ const uint32_t s, uint16_t* const temp,
+ uint8_t* const dst,
+ const ptrdiff_t dst_stride) {
// We have combined PreProcess and Process for the first pass by storing
// intermediate values in the |a2| region. The values stored are one vertical
// column of interleaved |a2| and |b2| values and consume 8 * |height| values.
@@ -1583,43 +1282,37 @@ inline void BoxFilterProcess_FirstPass(
// interleaved in |temp|. The first half is not stored, since it is used
// immediately and becomes useless for the next column. Next we will start the
// second column. When 2 rows have been calculated we can calculate Process
- // and output those into the top of |box_filter_process_output|.
+ // and output the results.
// Calculate and store a single column. Scope so we can re-use the variable
// names for the next step.
uint16_t* ab_ptr = temp;
- // The first phase needs a radius of 2 context values. The second phase
- // needs a context of radius 1 values. This means we start at (-3, -3).
- const uint8_t* const src_pre_process = src - 3 - 3 * stride;
- // Calculate intermediate results, including two-pixel border, for example,
- // if unit size is 64x64, we calculate 68x68 pixels.
+ const uint8_t* const src_pre_process = src - 2 * src_stride - 3;
+ // Calculate intermediate results, including two-pixel border, for example, if
+ // unit size is 64x64, we calculate 68x68 pixels.
{
const uint8_t* column = src_pre_process;
uint8x8_t row[5];
uint16x8_t row_sq[5];
-
- row[0] = vld1_u8(column);
- column += stride;
- row[1] = vld1_u8(column);
- column += stride;
+ row[0] = row[1] = vld1_u8(column);
+ column += src_stride;
row[2] = vld1_u8(column);
- row_sq[0] = vmull_u8(row[0], row[0]);
- row_sq[1] = vmull_u8(row[1], row[1]);
+ row_sq[0] = row_sq[1] = vmull_u8(row[1], row[1]);
row_sq[2] = vmull_u8(row[2], row[2]);
- int y = 0;
+ int y = (height + 2) >> 1;
do {
- column += stride;
+ column += src_stride;
row[3] = vld1_u8(column);
- column += stride;
+ column += src_stride;
row[4] = vld1_u8(column);
row_sq[3] = vmull_u8(row[3], row[3]);
row_sq[4] = vmull_u8(row[4], row[4]);
- PreProcess4<5, 0>(row, row_sq, s, ab_ptr);
+ BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr);
row[0] = row[2];
row[1] = row[3];
@@ -1629,10 +1322,18 @@ inline void BoxFilterProcess_FirstPass(
row_sq[1] = row_sq[3];
row_sq[2] = row_sq[4];
ab_ptr += 8;
- y += 2;
- } while (y < height + 2);
+ } while (--y != 0);
+
+ if ((height & 1) != 0) {
+ column += src_stride;
+ row[3] = row[4] = vld1_u8(column);
+ row_sq[3] = row_sq[4] = vmull_u8(row[3], row[3]);
+ BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr);
+ }
}
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0;
int x = 0;
do {
// |src_pre_process| is X but we already processed the first column of 4
@@ -1662,21 +1363,18 @@ inline void BoxFilterProcess_FirstPass(
const uint8_t* column = src_pre_process + x + 4;
uint8x16_t row[5];
uint16x8x2_t row_sq[5];
-
- row[0] = vld1q_u8(column);
- column += stride;
- row[1] = vld1q_u8(column);
- column += stride;
+ row[0] = row[1] = vld1q_u8(column);
+ column += src_stride;
row[2] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[3] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[4] = vld1q_u8(column);
- row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0]));
- row_sq[0].val[1] = vmull_u8(vget_high_u8(row[0]), vget_high_u8(row[0]));
- row_sq[1].val[0] = vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1]));
- row_sq[1].val[1] = vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1]));
+ row_sq[0].val[0] = row_sq[1].val[0] =
+ vmull_u8(vget_low_u8(row[1]), vget_low_u8(row[1]));
+ row_sq[0].val[1] = row_sq[1].val[1] =
+ vmull_u8(vget_high_u8(row[1]), vget_high_u8(row[1]));
row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2]));
row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2]));
row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3]));
@@ -1684,24 +1382,22 @@ inline void BoxFilterProcess_FirstPass(
row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4]));
row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4]));
- PreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
// Pass 1 Process. These are the only values we need to propagate between
// rows.
sum565_a[0] = Sum565(a2);
sum565_a[0] = vsubq_u16(vdupq_n_u16((5 + 6 + 5) * 256), sum565_a[0]);
- sum565_b[0].val[0] =
- Sum565W(vcombine_u16(vget_high_u16(b2[0]), vget_low_u16(b2[1])));
+ sum565_b[0].val[0] = Sum565W(vextq_u16(b2[0], b2[1], 4));
sum565_b[0].val[1] = Sum565W(b2[1]);
const uint8_t* src_ptr = src + x;
- uint16_t* out_buf = box_filter_process_output + x;
+ uint8_t* dst_ptr = dst + x;
// Calculate one output line. Add in the line from the previous pass and
// output one even row. Sum the new line and output the odd row. Carry the
// new row into the next pass.
- int y = 0;
- do {
+ for (int y = height >> 1; y != 0; --y) {
ab_ptr += 8;
b2[0] = vld1q_u16(ab_ptr);
a2[0] = vget_low_u8(vreinterpretq_u8_u16(b2[0]));
@@ -1714,9 +1410,9 @@ inline void BoxFilterProcess_FirstPass(
row_sq[1] = row_sq[3];
row_sq[2] = row_sq[4];
- column += stride;
+ column += src_stride;
row[3] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[4] = vld1q_u8(column);
row_sq[3].val[0] = vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3]));
@@ -1724,55 +1420,86 @@ inline void BoxFilterProcess_FirstPass(
row_sq[4].val[0] = vmull_u8(vget_low_u8(row[4]), vget_low_u8(row[4]));
row_sq[4].val[1] = vmull_u8(vget_high_u8(row[4]), vget_high_u8(row[4]));
- PreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
- uint8x8_t src_u8 = vld1_u8(src_ptr);
- BoxFilter1(src_u8, a2, b2, sum565_a, sum565_b, out_buf);
- src_ptr += stride;
- out_buf += kRestorationProcessingUnitSize;
+ const uint8x8_t src0 = vld1_u8(src_ptr);
+ const int16x8_t p0 = BoxFilterPass1(src0, a2, b2, sum565_a, sum565_b);
+ SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
- src_u8 = vld1_u8(src_ptr);
- CalculateFilteredOutput<4>(src_u8, sum565_a[1], sum565_b[1], out_buf);
- src_ptr += stride;
- out_buf += kRestorationProcessingUnitSize;
+ const uint8x8_t src1 = vld1_u8(src_ptr);
+ const int16x8_t p1 =
+ CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]);
+ SelfGuidedSingleMultiplier(src1, p1, w0, w1, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
sum565_a[0] = sum565_a[1];
sum565_b[0] = sum565_b[1];
- y += 2;
- } while (y < height);
+ }
+ if ((height & 1) != 0) {
+ ab_ptr += 8;
+ b2[0] = vld1q_u16(ab_ptr);
+ a2[0] = vget_low_u8(vreinterpretq_u8_u16(b2[0]));
+
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0] = row_sq[2];
+ row_sq[1] = row_sq[3];
+ row_sq[2] = row_sq[4];
+
+ column += src_stride;
+ row[3] = row[4] = vld1q_u8(column);
+
+ row_sq[3].val[0] = row_sq[4].val[0] =
+ vmull_u8(vget_low_u8(row[3]), vget_low_u8(row[3]));
+ row_sq[3].val[1] = row_sq[4].val[1] =
+ vmull_u8(vget_high_u8(row[3]), vget_high_u8(row[3]));
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
+
+ const uint8x8_t src0 = vld1_u8(src_ptr);
+ const int16x8_t p0 = BoxFilterPass1(src0, a2, b2, sum565_a, sum565_b);
+ SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr);
+ }
x += 8;
} while (x < width);
}
-inline void BoxFilterProcess_SecondPass(
- const uint8_t* src, const ptrdiff_t stride, const int width,
- const int height, const uint32_t s,
- uint16_t* const box_filter_process_output, uint16_t* const temp) {
+inline void BoxFilterProcessPass2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const RestorationUnitInfo& restoration_info,
+ const int width, const int height,
+ const uint32_t s, uint16_t* const temp,
+ uint8_t* const dst,
+ const ptrdiff_t dst_stride) {
uint16_t* ab_ptr = temp;
- // Calculate intermediate results, including one-pixel border, for example,
- // if unit size is 64x64, we calculate 66x66 pixels.
+ // Calculate intermediate results, including one-pixel border, for example, if
+ // unit size is 64x64, we calculate 66x66 pixels.
// Because of the vectors this calculates start in blocks of 4 so we actually
// get 68 values.
- const uint8_t* const src_top_left_corner = src - 2 - 2 * stride;
+ const uint8_t* const src_top_left_corner = src - 2 - 2 * src_stride;
{
const uint8_t* column = src_top_left_corner;
uint8x8_t row[3];
uint16x8_t row_sq[3];
-
row[0] = vld1_u8(column);
- column += stride;
+ column += src_stride;
row[1] = vld1_u8(column);
row_sq[0] = vmull_u8(row[0], row[0]);
row_sq[1] = vmull_u8(row[1], row[1]);
int y = height + 2;
do {
- column += stride;
+ column += src_stride;
row[2] = vld1_u8(column);
row_sq[2] = vmull_u8(row[2], row[2]);
- PreProcess4<3, 0>(row, row_sq, s, ab_ptr);
+ BoxFilterPreProcess4<3, 0>(row, row_sq, s, ab_ptr);
row[0] = row[1];
row[1] = row[2];
@@ -1780,13 +1507,14 @@ inline void BoxFilterProcess_SecondPass(
row_sq[0] = row_sq[1];
row_sq[1] = row_sq[2];
ab_ptr += 8;
- } while (--y);
+ } while (--y != 0);
}
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
int x = 0;
do {
- const uint8_t* src_ptr = src + x;
- uint16_t* out_buf = box_filter_process_output + x;
ab_ptr = temp;
uint8x8_t a2[2];
@@ -1799,9 +1527,9 @@ inline void BoxFilterProcess_SecondPass(
uint8x16_t row[3];
uint16x8x2_t row_sq[3];
row[0] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[1] = vld1q_u8(column);
- column += stride;
+ column += src_stride;
row[2] = vld1q_u8(column);
row_sq[0].val[0] = vmull_u8(vget_low_u8(row[0]), vget_low_u8(row[0]));
@@ -1811,7 +1539,7 @@ inline void BoxFilterProcess_SecondPass(
row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2]));
row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2]));
- PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
+ BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
sum343_a[0] = Sum343(a2);
sum343_a[0] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[0]);
@@ -1826,19 +1554,21 @@ inline void BoxFilterProcess_SecondPass(
row_sq[0] = row_sq[1];
row_sq[1] = row_sq[2];
- column += stride;
+ column += src_stride;
row[2] = vld1q_u8(column);
row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2]));
row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2]));
- PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
+ BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
Sum343_444(a2, &sum343_a[1], &sum444_a[0]);
sum343_a[1] = vsubq_u16(vdupq_n_u16((3 + 4 + 3) * 256), sum343_a[1]);
sum444_a[0] = vsubq_u16(vdupq_n_u16((4 + 4 + 4) * 256), sum444_a[0]);
Sum343_444W(b2, &sum343_b[1], &sum444_b[0]);
+ const uint8_t* src_ptr = src + x;
+ uint8_t* dst_ptr = dst + x;
int y = height;
do {
ab_ptr += 8;
@@ -1850,214 +1580,59 @@ inline void BoxFilterProcess_SecondPass(
row_sq[0] = row_sq[1];
row_sq[1] = row_sq[2];
- column += stride;
+ column += src_stride;
row[2] = vld1q_u8(column);
row_sq[2].val[0] = vmull_u8(vget_low_u8(row[2]), vget_low_u8(row[2]));
row_sq[2].val[1] = vmull_u8(vget_high_u8(row[2]), vget_high_u8(row[2]));
- PreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
+ BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2[1], &b2[1], ab_ptr);
uint8x8_t src_u8 = vld1_u8(src_ptr);
- BoxFilter2(src_u8, a2, b2, sum343_a, sum444_a, sum343_b, sum444_b,
- out_buf);
+ int16x8_t p = BoxFilterPass2(src_u8, a2, b2, sum343_a, sum444_a, sum343_b,
+ sum444_b);
+ SelfGuidedSingleMultiplier(src_u8, p, w0, w1, dst_ptr);
sum343_a[0] = sum343_a[1];
sum343_a[1] = sum343_a[2];
sum444_a[0] = sum444_a[1];
sum343_b[0] = sum343_b[1];
sum343_b[1] = sum343_b[2];
sum444_b[0] = sum444_b[1];
- src_ptr += stride;
- out_buf += kRestorationProcessingUnitSize;
- } while (--y);
- x += 8;
- } while (x < width);
-}
-
-inline void SelfGuidedSingleMultiplier(
- const uint8_t* src, const ptrdiff_t src_stride,
- const uint16_t* const box_filter_process_output, uint8_t* dst,
- const ptrdiff_t dst_stride, const int width, const int height,
- const int16_t w_single) {
- const int16_t w_combo = (1 << kSgrProjPrecisionBits) - w_single;
- const auto* box_filter =
- reinterpret_cast<const int16_t*>(box_filter_process_output);
- int w = width;
-
- if (w & 4) {
- w -= 4;
- const uint8_t* src_ptr = src + w;
- uint8_t* dst_ptr = dst + w;
- const int16_t* box_filter_w = box_filter + w;
- int y = height;
- do {
- const int16x8_t u = vreinterpretq_s16_u16(
- vshll_n_u8(vld1_u8(src_ptr), kSgrProjRestoreBits));
- const int16x4_t p = vld1_s16(box_filter_w);
- // u * w1 + u * wN == u * (w1 + wN)
- int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo);
- v_lo = vmlal_n_s16(v_lo, p, w_single);
- const int16x4_t s_lo =
- vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- StoreLo4(dst_ptr, vqmovun_s16(vcombine_s16(s_lo, s_lo)));
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- box_filter_w += kRestorationProcessingUnitSize;
- } while (--y);
-
- if (!w) return;
- }
-
- int y = height;
- do {
- int x = 0;
- do {
- const int16x8_t u = vreinterpretq_s16_u16(
- vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits));
- const int16x8_t p = vld1q_s16(box_filter + x);
- // u * w1 + u * wN == u * (w1 + wN)
- int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo);
- v_lo = vmlal_n_s16(v_lo, vget_low_s16(p), w_single);
- int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w_combo);
- v_hi = vmlal_n_s16(v_hi, vget_high_s16(p), w_single);
- const int16x4_t s_lo =
- vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- const int16x4_t s_hi =
- vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi)));
- x += 8;
- } while (x < w);
- src += src_stride;
- dst += dst_stride;
- box_filter += kRestorationProcessingUnitSize;
- } while (--y);
-}
-
-inline void SelfGuidedDoubleMultiplier(
- const uint8_t* src, const ptrdiff_t src_stride,
- const uint16_t* const box_filter_process_output, uint8_t* dst,
- const ptrdiff_t dst_stride, const int width, const int height, const int w0,
- const int w1, const int w2) {
- const auto* box_filter =
- reinterpret_cast<const int16_t*>(box_filter_process_output);
- const int16x4_t w0_v = vdup_n_s16(w0);
- const int16x4_t w1_v = vdup_n_s16(w1);
- const int16x4_t w2_v = vdup_n_s16(w2);
- int w = width;
-
- if (w & 4) {
- w -= 4;
- const uint8_t* src_ptr = src + w;
- uint8_t* dst_ptr = dst + w;
- const int16_t* box_filter_w = box_filter + 2 * w;
- int y = height;
- do {
- // |wN| values are signed. |src| values can be treated as int16_t.
- // Load 8 values but ignore 4.
- const int16x4_t u = vget_low_s16(vreinterpretq_s16_u16(
- vshll_n_u8(vld1_u8(src_ptr), kSgrProjRestoreBits)));
- // |box_filter_process_output| is 14 bits, also safe to treat as int16_t.
- const int16x4_t p0 = vld1_s16(box_filter_w + 0);
- const int16x4_t p1 = vld1_s16(box_filter_w + 8);
- int32x4_t v = vmull_s16(u, w1_v);
- v = vmlal_s16(v, p0, w0_v);
- v = vmlal_s16(v, p1, w2_v);
- // |s| is saturated to uint8_t.
- const int16x4_t s =
- vrshrn_n_s32(v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- StoreLo4(dst_ptr, vqmovun_s16(vcombine_s16(s, s)));
src_ptr += src_stride;
dst_ptr += dst_stride;
- box_filter_w += 2 * kRestorationProcessingUnitSize;
- } while (--y);
-
- if (!w) return;
- }
-
- int y = height;
- do {
- int x = 0;
- do {
- // |wN| values are signed. |src| values can be treated as int16_t.
- const int16x8_t u = vreinterpretq_s16_u16(
- vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits));
- // |box_filter_process_output| is 14 bits, also safe to treat as int16_t.
- const int16x8_t p0 = vld1q_s16(box_filter + 2 * x + 0);
- const int16x8_t p1 = vld1q_s16(box_filter + 2 * x + 8);
- int32x4_t v_lo = vmull_s16(vget_low_s16(u), w1_v);
- v_lo = vmlal_s16(v_lo, vget_low_s16(p0), w0_v);
- v_lo = vmlal_s16(v_lo, vget_low_s16(p1), w2_v);
- int32x4_t v_hi = vmull_s16(vget_high_s16(u), w1_v);
- v_hi = vmlal_s16(v_hi, vget_high_s16(p0), w0_v);
- v_hi = vmlal_s16(v_hi, vget_high_s16(p1), w2_v);
- // |s| is saturated to uint8_t.
- const int16x4_t s_lo =
- vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- const int16x4_t s_hi =
- vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi)));
- x += 8;
- } while (x < w);
- src += src_stride;
- dst += dst_stride;
- box_filter += 2 * kRestorationProcessingUnitSize;
- } while (--y);
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
}
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not be
+// part of the visible frame.
void SelfGuidedFilter_NEON(const void* const source, void* const dest,
const RestorationUnitInfo& restoration_info,
- ptrdiff_t source_stride, ptrdiff_t dest_stride,
- const int width, const int height,
- RestorationBuffer* const /*buffer*/) {
- const auto* src = static_cast<const uint8_t*>(source);
-
- // The output frame is broken into blocks of 64x64 (32x32 if U/V are
- // subsampled). If either dimension is less than 32/64 it indicates it is at
- // the right or bottom edge of the frame. It is safe to overwrite the output
- // as it will not be part of the visible frame. This saves us from having to
- // handle non-multiple-of-8 widths.
- // We could round here, but the for loop with += 8 does the same thing.
-
- // width = (width + 7) & ~0x7;
-
- // -96 to 96 (Sgrproj_Xqd_Min/Max)
+ const ptrdiff_t source_stride,
+ const ptrdiff_t dest_stride, const int width,
+ const int height, RestorationBuffer* const buffer) {
const int index = restoration_info.sgr_proj_info.index;
- const int radius_pass_0 = kSgrProjParams[index][0];
- const int radius_pass_1 = kSgrProjParams[index][2];
- alignas(kMaxAlignment)
- uint16_t box_filter_process_output[2 * kMaxBoxFilterProcessOutputPixels];
- alignas(kMaxAlignment)
- uint16_t temp[12 * (kRestorationProcessingUnitSize + 2)];
-
- // If |radius| is 0 then there is nothing to do. If |radius| is not 0, it is
- // always 2 for the first pass and 1 for the second pass.
- const int w0 = restoration_info.sgr_proj_info.multiplier[0];
- const int w1 = restoration_info.sgr_proj_info.multiplier[1];
- const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* src = static_cast<const uint8_t*>(source);
auto* dst = static_cast<uint8_t*>(dest);
- // Note: Combining box filter process with the final multipliers has no speed
- // gain. There are not enough neon registers to hold those weights.
- if (radius_pass_0 != 0 && radius_pass_1 != 0) {
- BoxFilterProcess(src, source_stride, width, height,
- kSgrScaleParameter[index], box_filter_process_output,
- temp);
- SelfGuidedDoubleMultiplier(src, source_stride, box_filter_process_output,
- dst, dest_stride, width, height, w0, w1, w2);
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(src, source_stride, restoration_info, width, height,
+ kSgrScaleParameter[index][0], buffer->sgf_buffer, dst,
+ dest_stride);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(src, source_stride, restoration_info, width, height,
+ kSgrScaleParameter[index][1], buffer->sgf_buffer, dst,
+ dest_stride);
} else {
- int16_t w_single;
- if (radius_pass_0 != 0) {
- BoxFilterProcess_FirstPass(src, source_stride, width, height,
- kSgrScaleParameter[index][0],
- box_filter_process_output, temp);
- w_single = w0;
- } else /* if (radius_pass_1 != 0) */ {
- BoxFilterProcess_SecondPass(src, source_stride, width, height,
- kSgrScaleParameter[index][1],
- box_filter_process_output, temp);
- w_single = w2;
- }
- SelfGuidedSingleMultiplier(src, source_stride, box_filter_process_output,
- dst, dest_stride, width, height, w_single);
+ BoxFilterProcess(src, source_stride, restoration_info, width, height,
+ kSgrScaleParameter[index], buffer->sgf_buffer, dst,
+ dest_stride);
}
}
diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc
index b84548de6f7..3e731b22450 100644
--- a/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/arm/motion_field_projection_neon.cc
@@ -34,92 +34,77 @@ namespace libgav1 {
namespace dsp {
namespace {
-inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
- // Add 63 to negative delta so that it shifts towards zero.
- const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
- const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
- const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
- const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
- const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
- const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
- const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
- const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
- return vqmovn_s16(offset2);
-}
-
-inline int16x8_t LookupTable(const int8x8x4_t division_table,
- const int8x16_t idx) {
- const int8x8_t idx_low = vget_low_s8(idx);
- const int8x8_t idx_high = vget_high_s8(idx);
- const int16x4_t d0 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_low));
- const int16x4_t d1 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_high));
- return vcombine_s16(d0, d1);
-}
-
-inline int16x8_t LoadDivision(const int8x8x4_t division_table[2],
+inline int16x8_t LoadDivision(const int8x8x2_t division_table,
const int8x8_t reference_offset) {
- const int8x16_t k32 = vdupq_n_s8(32);
const int8x8_t kOne = vcreate_s8(0x0100010001000100);
const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
const int8x8_t t = vadd_s8(reference_offset, reference_offset);
const int8x8x2_t tt = vzip_s8(t, t);
const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
- const int8x16_t idx0 = vaddq_s8(t1, kOneQ);
- const int8x16_t idx1 = vsubq_s8(idx0, k32);
- const int16x8_t denorm0 = LookupTable(division_table[0], idx0);
- const int16x8_t denorm1 = LookupTable(division_table[1], idx1);
- return vorrq_s16(denorm0, denorm1);
+ const int8x16_t idx = vaddq_s8(t1, kOneQ);
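+ // |idx| holds byte offsets {2 * i, 2 * i + 1} for each lane, so the vtbl2
+ // lookups below reassemble the 16-bit division table entries bytewise.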
+ const int8x8_t idx_low = vget_low_s8(idx);
+ const int8x8_t idx_high = vget_high_s8(idx);
+ const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
+ const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
+ return vcombine_s16(d0, d1);
}
inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
const int numerator) {
const int32x4_t m0 = vmull_s16(mv, denominator);
const int32x4_t m = vmulq_n_s32(m0, numerator);
- // Subtract the sign bit to round towards zero.
- const int32x4_t sub_sign = vsraq_n_s32(m, m, 31);
- return vqrshrn_n_s32(sub_sign, 14);
+ // Add the sign (0 or -1) to round towards zero.
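+ // For example, m = -8192: (m + (m >> 31) + 8192) >> 14 = -1, whereas the
+ // plain rounding shift would give 0.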
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
}
inline int16x8_t MvProjectionClip(const int16x8_t mv,
const int16x8_t denominator,
const int numerator) {
- const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
const int16x4_t mv0 = vget_low_s16(mv);
const int16x4_t mv1 = vget_high_s16(mv);
- const int16x4_t m0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
- const int16x4_t m1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
- const int16x8_t m = vcombine_s16(m0, m1);
- const int16x8_t clamp = vminq_s16(m, projection_mv_clamp);
+ const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
+ const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
+ const int16x8_t projection = vcombine_s16(s0, s1);
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
}
-inline void GetMvProjection(const int32x4_t mv[2], const int16x8_t denominator,
- const int numerator, int16x8_t projection_mv[2]) {
- const int16x8_t mv0 = vreinterpretq_s16_s32(mv[0]);
- const int16x8_t mv1 = vreinterpretq_s16_s32(mv[1]);
- // Deinterlace
- const int16x8x2_t mvs = vuzpq_s16(mv0, mv1);
- projection_mv[0] = MvProjectionClip(mvs.val[0], denominator, numerator);
- projection_mv[1] = MvProjectionClip(mvs.val[1], denominator, numerator);
+inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
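+ // For example, delta = -1 becomes (-1 + 63) >> 6 = 0 instead of -1, i.e. a
+ // divide by 64 that truncates toward zero.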
+ const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
+ const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
+ const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
+ const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
+ const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
+ const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
+ const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
+ const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
+ return vqmovn_s16(offset2);
}
-void GetPosition(const int8x8x4_t division_table[2],
- const MotionVector* const mv,
- const int reference_to_current_with_sign, const int x8_start,
- const int x8_end, const int x8, const int8x8_t r_offsets,
- const int8x8_t source_reference_type8, const int8x8_t skip_r,
- const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
- const int16x8_t d_sign, const int delta, int8x8_t* const r,
- int8x8_t* const position_y8, int8x8_t* const position_x8,
- int64_t* const skip_64, int32x4_t mvs[2]) {
- const int32_t* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+inline void GetPosition(
+ const int8x8x2_t division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const int8x8_t r_offsets, const int8x8_t source_reference_type8,
+ const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
+ const int16x8_t d_sign, const int delta, int8x8_t* const r,
+ int8x8_t* const position_y8, int8x8_t* const position_x8,
+ int64_t* const skip_64, int32x4_t mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
*r = vtbl1_s8(r_offsets, source_reference_type8);
- const int16x8_t denorm = LoadDivision(division_table, *r);
+ const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
int16x8_t projection_mv[2];
mvs[0] = vld1q_s32(mv_int + 0);
mvs[1] = vld1q_s32(mv_int + 4);
- // reference_to_current_with_sign could be 0.
- GetMvProjection(mvs, denorm, reference_to_current_with_sign, projection_mv);
+ // Deinterleave the x and y components
+ const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
+ const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
+ const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
// Do not update the motion vector if the block position is not valid or
// if position_x8 is outside the current range of x8_start and x8_end.
// Note that position_y8 will always be within the range of y8_start and
@@ -147,46 +132,31 @@ void GetPosition(const int8x8x4_t division_table[2],
}
template <int idx>
-int16_t VgetqLaneS16(const int16x8_t src) {
- if (idx == 0) return vgetq_lane_s16(src, 0);
- if (idx == 1) return vgetq_lane_s16(src, 1);
- if (idx == 2) return vgetq_lane_s16(src, 2);
- if (idx == 3) return vgetq_lane_s16(src, 3);
- if (idx == 4) return vgetq_lane_s16(src, 4);
- if (idx == 5) return vgetq_lane_s16(src, 5);
- if (idx == 6) return vgetq_lane_s16(src, 6);
- return vgetq_lane_s16(src, 7);
-}
-
-template <int idx>
inline void Store(const int16x8_t position, const int8x8_t reference_offset,
- const int32x4_t mvs, int8_t* dst_reference_offset,
+ const int32x4_t mv, int8_t* dst_reference_offset,
MotionVector* dst_mv) {
- const ptrdiff_t offset = VgetqLaneS16<idx>(position);
- int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
- vst1q_lane_s32(d_mv, mvs, idx & 3);
+ const ptrdiff_t offset = vgetq_lane_s16(position, idx);
+ auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
+ vst1q_lane_s32(d_mv, mv, idx & 3);
vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
}
template <int idx>
inline void CheckStore(const int8_t* skips, const int16x8_t position,
- const int8x8_t reference_offset, const int32x4_t mvs,
+ const int8x8_t reference_offset, const int32x4_t mv,
int8_t* dst_reference_offset, MotionVector* dst_mv) {
if (skips[idx] == 0) {
- const ptrdiff_t offset = VgetqLaneS16<idx>(position);
- int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
- vst1q_lane_s32(d_mv, mvs, idx & 3);
- vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
}
}
// 7.9.2.
-void MotionFieldProjectionKernel_NEON(
- const ReferenceFrameType* source_reference_type, const MotionVector* mv,
- const uint8_t order_hint[kNumReferenceFrameTypes],
- unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits,
- int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end,
- int x8_start, int x8_end, TemporalMotionField* motion_field) {
+void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign,
+ const int dst_sign, const int y8_start,
+ const int y8_end, const int x8_start,
+ const int x8_end,
+ TemporalMotionField* const motion_field) {
const ptrdiff_t stride = motion_field->mv.columns();
// The column range has to be offset by kProjectionMvMaxHorizontalOffset since
// coordinates in that range could end up being position_x8 because of
@@ -197,14 +167,17 @@ void MotionFieldProjectionKernel_NEON(
x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
const int adjusted_x8_end8 = adjusted_x8_end & ~7;
const int leftover = adjusted_x8_end - adjusted_x8_end8;
- const int8_t* const table =
- reinterpret_cast<const int8_t*>(kProjectionMvDivisionLookup);
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
MotionVector* dst_mv = motion_field->mv[y8_start];
const int16x8_t d_sign = vdupq_n_s16(dst_sign);
- int8_t reference_offsets[kNumReferenceFrameTypes];
- bool skip_reference[kNumReferenceFrameTypes];
- int8x8x4_t division_table[2];
static_assert(sizeof(int8_t) == sizeof(bool), "");
static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
@@ -219,37 +192,13 @@ void MotionFieldProjectionKernel_NEON(
// which means this optimization works for frame width up to 32K (each
// position is an 8x8 block).
assert(8 * stride <= 32768);
-
- const int8x8_t current_order_hints = vdup_n_s8(current_frame_order_hint);
- const int8x8_t order_hints = vreinterpret_s8_u8(vld1_u8(order_hint));
- const int8x8_t diff = vsub_s8(current_order_hints, order_hints);
- // |order_hint_shift_bits| - 24 could be -24. In this case diff is 0,
- // and the behavior of left or right shifting -24 bits is defined for ARM NEON
- // instructions, and the result of shifting 0 is still 0.
- const int8x8_t left_shift_bits = vdup_n_s8(order_hint_shift_bits - 24);
- const int8x8_t diff_shift_left = vshl_s8(diff, left_shift_bits);
- const int8x8_t r_offsets = vshl_s8(diff_shift_left, vneg_s8(left_shift_bits));
- const uint8x8_t overflow = vcgt_s8(r_offsets, vdup_n_s8(kMaxFrameDistance));
- const uint8x8_t underflow = vcle_s8(r_offsets, vdup_n_s8(0));
- const int8x8_t sk = vreinterpret_s8_u8(vorr_u8(overflow, underflow));
- // Initialize skip_reference[kReferenceFrameIntra] to simplify branch
- // conditions in projection.
- const int8x8_t skip_reference8 = vset_lane_s8(-1, sk, 0);
- vst1_s8(reinterpret_cast<int8_t*>(skip_reference), skip_reference8);
- vst1_s8(reference_offsets, r_offsets);
-
- // The compiler is inefficient when using vld4_s64(). Instructions waste in
- // copying from int64x1x4_t to int8x8x4_t, and there is no such vector
- // reinterpret intrinsics available to the best of our knowledge. Anyway
- // compiler is good enough to use 4 vld1q_s8().
- division_table[0].val[0] = vld1_s8(table + 0 * 8);
- division_table[0].val[1] = vld1_s8(table + 1 * 8);
- division_table[0].val[2] = vld1_s8(table + 2 * 8);
- division_table[0].val[3] = vld1_s8(table + 3 * 8);
- division_table[1].val[0] = vld1_s8(table + 4 * 8);
- division_table[1].val[1] = vld1_s8(table + 5 * 8);
- division_table[1].val[2] = vld1_s8(table + 6 * 8);
- division_table[1].val[3] = vld1_s8(table + 7 * 8);
+ const int8x8_t skip_reference =
+ vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
+ const int8x8_t r_offsets = vld1_s8(reference_offsets);
+ const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
+ int8x8x2_t division_table;
+ division_table.val[0] = vget_low_s8(table);
+ division_table.val[1] = vget_high_s8(table);
int y8 = y8_start;
do {
@@ -261,8 +210,8 @@ void MotionFieldProjectionKernel_NEON(
for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
const int8x8_t source_reference_type8 =
- vld1_s8(reinterpret_cast<const int8_t*>(source_reference_type + x8));
- const int8x8_t skip_r = vtbl1_s8(skip_reference8, source_reference_type8);
+ vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
+ const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
// Early termination #1 if all are skips. Chance is typically ~30-40%.
if (early_skip == -1) continue;
@@ -278,8 +227,8 @@ void MotionFieldProjectionKernel_NEON(
if (skip_64 == -1) continue;
const int16x8_t p_y = vmovl_s8(position_y8);
const int16x8_t p_x = vmovl_s8(position_x8);
- const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride);
- const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8));
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
if (skip_64 == 0) {
// Store all. Chance is typically ~70-85% after Early termination #2.
Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
@@ -318,9 +267,9 @@ void MotionFieldProjectionKernel_NEON(
const int delta = 8 - leftover;
x8 = adjusted_x8_end - 8;
const int8x8_t source_reference_type8 = vld1_s8(
- reinterpret_cast<const int8_t*>(source_reference_type + x8));
+ reinterpret_cast<const int8_t*>(source_reference_types + x8));
const int8x8_t skip_r =
- vtbl1_s8(skip_reference8, source_reference_type8);
+ vtbl1_s8(skip_reference, source_reference_type8);
const int64_t early_skip =
vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
// Early termination #1 if all are skips.
@@ -336,8 +285,8 @@ void MotionFieldProjectionKernel_NEON(
if (skip_64 != -1) {
const int16x8_t p_y = vmovl_s8(position_y8);
const int16x8_t p_x = vmovl_s8(position_x8);
- const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride);
- const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8));
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
// Store up to 7 elements since leftover is at most 7.
if (skip_64 == 0) {
// Store all.
@@ -373,13 +322,13 @@ void MotionFieldProjectionKernel_NEON(
}
} else {
for (; x8 < adjusted_x8_end; ++x8) {
- if (skip_reference[source_reference_type[x8]]) continue;
- const int reference_offset =
- reference_offsets[source_reference_type[x8]];
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
MotionVector projection_mv;
// reference_to_current_with_sign could be 0.
GetMvProjection(mv[x8], reference_to_current_with_sign,
- reference_offset, &projection_mv);
+ projection_divisions[source_reference_type],
+ &projection_mv);
// Do not update the motion vector if the block position is not valid
// or if position_x8 is outside the current range of x8_start and
// x8_end. Note that position_y8 will always be within the range of
@@ -395,12 +344,12 @@ void MotionFieldProjectionKernel_NEON(
if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
dst_mv[position_y8 * stride + position_x8] = mv[x8];
dst_reference_offset[position_y8 * stride + position_x8] =
- reference_offset;
+ reference_offsets[source_reference_type];
}
}
}
- source_reference_type += stride;
+ source_reference_types += stride;
mv += stride;
dst_reference_offset += stride;
dst_mv += stride;
diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc
index 5332180dfbc..da3ba1706e6 100644
--- a/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/arm/motion_vector_search_neon.cc
@@ -64,7 +64,7 @@ inline int16x8_t MvProjectionCompoundClip(
const MotionVector* const temporal_mvs,
const int8_t* const temporal_reference_offsets,
const int reference_offsets[2]) {
- const int32_t* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
const int32x2_t temporal_mv = vld1_s32(tmvs);
const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
@@ -79,7 +79,7 @@ inline int16x8_t MvProjectionSingleClip(
const MotionVector* const temporal_mvs,
const int8_t* const temporal_reference_offsets, const int reference_offset,
int16x4_t* const lookup) {
- const int16_t* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
const int16x8_t temporal_mv = vld1q_s16(tmvs);
*lookup = vld1_lane_s16(
&kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
@@ -98,27 +98,26 @@ inline int16x8_t MvProjectionSingleClip(
return ProjectionClip(mv0, mv1);
}
-void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
- const int16x8_t k1 = vdupq_n_s16(1);
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(1);
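+ // Adding the sign bit (|mvu| >> 15) before clearing bit 0 makes the
+ // truncation to a multiple of 2 round toward zero for negative components,
+ // e.g. -3 -> -2 rather than -4.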
const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
- const int16x8_t mv1 = vbicq_s16(mv0, k1);
+ const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
}
-void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
- const int16x8_t k3 = vdupq_n_s16(3);
- const int16x8_t k7 = vdupq_n_s16(7);
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(7);
const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
- const int16x8_t mv1 = vaddq_s16(mv0, k3);
- const int16x8_t mv2 = vbicq_s16(mv1, k7);
+ const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+ const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
}
void MvProjectionCompoundLowPrecision_NEON(
const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
- const int reference_offsets[2], int count,
+ const int reference_offsets[2], const int count,
CompoundMotionVector* candidate_mvs) {
// The |reference_offsets| non-zero check is usually true and is therefore
// skipped.
// To help the compiler, make a local copy of |reference_offsets|.
diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc b/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc
index 901aa3ddedf..c7fb739ba75 100644
--- a/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/arm/warp_neon.cc
@@ -133,7 +133,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
assert(block_width >= 8);
assert(block_height >= 8);
- // Warp process applies for each 8x8 block (or smaller).
+ // Warp process applies for each 8x8 block.
int start_y = block_start_y;
do {
int start_x = block_start_x;
diff --git a/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h b/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h
index f13eb13605c..b4749ec6aea 100644
--- a/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h
+++ b/chromium/third_party/libgav1/src/src/dsp/arm/weight_mask_neon.h
@@ -36,6 +36,7 @@ void WeightMaskInit_NEON();
#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
diff --git a/chromium/third_party/libgav1/src/src/dsp/cdef.cc b/chromium/third_party/libgav1/src/src/dsp/cdef.cc
index 0ebee20d8b5..a7c720b77cc 100644
--- a/chromium/third_party/libgav1/src/src/dsp/cdef.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/cdef.cc
@@ -29,6 +29,8 @@ namespace libgav1 {
namespace dsp {
namespace {
+#include "src/dsp/cdef.inc"
+
// Silence unused function warnings when CdefDirection_C is obviated.
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
!defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
@@ -119,21 +121,23 @@ int Constrain(int diff, int threshold, int damping) {
// constant large value if at the boundary. And the input should be uint16_t.
template <int bitdepth, typename Pixel>
void CdefFilter_C(const void* const source, const ptrdiff_t source_stride,
- const int rows4x4, const int columns4x4, const int curr_x,
- const int curr_y, const int subsampling_x,
- const int subsampling_y, const int primary_strength,
- const int secondary_strength, const int damping,
- const int direction, void* const dest,
+ const int block_width, const int block_height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction, void* const dest,
const ptrdiff_t dest_stride) {
- static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0,
- kCdefSecondaryTap1};
- const int coeff_shift = bitdepth - 8;
- const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x;
- const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y;
- const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x);
assert(block_width == 4 || block_width == 8);
- const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y);
assert(block_height == 4 || block_height == 8);
+ assert(direction >= 0 && direction <= 7);
+ constexpr int coeff_shift = bitdepth - 8;
+ // Section 5.9.19. CDEF params syntax.
+ assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift);
+ assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift &&
+ secondary_strength != 3 << coeff_shift);
+ // damping is decreased by 1 for chroma.
+ assert((damping >= 3 && damping <= 6 + coeff_shift) ||
+ (damping >= 2 && damping <= 5 + coeff_shift));
+ static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0,
+ kCdefSecondaryTap1};
const auto* src = static_cast<const uint16_t*>(source);
auto* dst = static_cast<Pixel*>(dest);
const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
@@ -146,7 +150,7 @@ void CdefFilter_C(const void* const source, const ptrdiff_t source_stride,
uint16_t max_value = pixel_value;
uint16_t min_value = pixel_value;
for (int k = 0; k < 2; ++k) {
- const int signs[] = {-1, 1};
+ static constexpr int signs[] = {-1, 1};
for (const int& sign : signs) {
int dy = sign * kCdefDirections[direction][k][0];
int dx = sign * kCdefDirections[direction][k][1];
@@ -160,10 +164,10 @@ void CdefFilter_C(const void* const source, const ptrdiff_t source_stride,
max_value = std::max(value, max_value);
min_value = std::min(value, min_value);
}
- const int offsets[] = {-2, 2};
+ static constexpr int offsets[] = {-2, 2};
for (const int& offset : offsets) {
- dy = sign * kCdefDirections[(direction + offset) & 7][k][0];
- dx = sign * kCdefDirections[(direction + offset) & 7][k][1];
+ dy = sign * kCdefDirections[direction + offset][k][0];
+ dx = sign * kCdefDirections[direction + offset][k][1];
value = src[dy * source_stride + dx + x];
// Note: the summation can ignore the condition check in SIMD
// implementation.
diff --git a/chromium/third_party/libgav1/src/src/dsp/cdef.inc b/chromium/third_party/libgav1/src/src/dsp/cdef.inc
new file mode 100644
index 00000000000..c1a31361796
--- /dev/null
+++ b/chromium/third_party/libgav1/src/src/dsp/cdef.inc
@@ -0,0 +1,29 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants used for cdef implementations.
+// This file is included inside an anonymous namespace in the files where these
+// constants are necessary.
+
+const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2;
+
+// Mirror values and pad to 16 elements.
+alignas(16) constexpr uint32_t kCdefDivisionTable[] = {
+ 840, 420, 280, 210, 168, 140, 120, 105,
+ 120, 140, 168, 210, 280, 420, 840, 0};
+
+// Used when calculating odd |cost[x]| values to mask off unwanted elements.
+// Holds elements 1 3 5 X 5 3 1 X
+alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0,
+ 140, 210, 420, 0};
diff --git a/chromium/third_party/libgav1/src/src/dsp/common.h b/chromium/third_party/libgav1/src/src/dsp/common.h
index 2532d177856..2a08403379f 100644
--- a/chromium/third_party/libgav1/src/src/dsp/common.h
+++ b/chromium/third_party/libgav1/src/src/dsp/common.h
@@ -45,15 +45,15 @@ struct RestorationUnitInfo : public MaxAlignedAllocable {
WienerInfo wiener_info;
};
-struct RestorationBuffer {
+union RestorationBuffer {
// For self-guided filter.
- int* box_filter_process_output[2];
- ptrdiff_t box_filter_process_output_stride;
- uint32_t* box_filter_process_intermediate[2];
- ptrdiff_t box_filter_process_intermediate_stride;
+ alignas(kMaxAlignment) uint16_t sgf_buffer[12 * (kRestorationUnitHeight + 2)];
// For wiener filter.
- uint16_t* wiener_buffer;
- ptrdiff_t wiener_buffer_stride;
+ // The array |intermediate| in Section 7.17.4, the intermediate results
+ // between the horizontal and vertical filters.
+ alignas(kMaxAlignment) uint16_t
+ wiener_buffer[(kRestorationUnitHeight + kSubPixelTaps - 1) *
+ kRestorationUnitWidth];
};
} // namespace libgav1
diff --git a/chromium/third_party/libgav1/src/src/dsp/constants.cc b/chromium/third_party/libgav1/src/src/dsp/constants.cc
index 1b9e6fc14e0..0099ca36c8c 100644
--- a/chromium/third_party/libgav1/src/src/dsp/constants.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/constants.cc
@@ -81,8 +81,23 @@ const uint16_t kSgrScaleParameter[16][2] = {
const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}};
-const int8_t kCdefDirections[8][2][2] = {
- {{-1, 1}, {-2, 2}}, {{0, 1}, {-1, 2}}, {{0, 1}, {0, 2}}, {{0, 1}, {1, 2}},
- {{1, 1}, {2, 2}}, {{1, 0}, {2, 1}}, {{1, 0}, {2, 0}}, {{1, 0}, {2, -1}}};
+// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+// beginning and end of the table. The cdef direction range is [0, 7] and the
+// first index is offset +/-2. This removes the need to constrain the first
+// index to the same range using e.g., & 7.
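+// With kCdefDirections pointing 2 entries into this table (see cdef.inc),
+// kCdefDirections[direction + offset], with direction in [0, 7] and offset in
+// {-2, 2}, stays within entries 0 through 11.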
+const int8_t kCdefDirectionsPadded[12][2][2] = {
+ {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6]
+ {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7]
+ {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions
+ {{0, 1}, {-1, 2}}, //
+ {{0, 1}, {0, 2}}, //
+ {{0, 1}, {1, 2}}, //
+ {{1, 1}, {2, 2}}, //
+ {{1, 0}, {2, 1}}, //
+ {{1, 0}, {2, 0}}, //
+ {{1, 0}, {2, -1}}, // End Cdef_Directions
+ {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0]
+ {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1]
+};
} // namespace libgav1
diff --git a/chromium/third_party/libgav1/src/src/dsp/constants.h b/chromium/third_party/libgav1/src/src/dsp/constants.h
index d588d22af41..7c1b62c4926 100644
--- a/chromium/third_party/libgav1/src/src/dsp/constants.h
+++ b/chromium/third_party/libgav1/src/src/dsp/constants.h
@@ -64,7 +64,7 @@ extern const uint16_t kSgrScaleParameter[16][2];
extern const uint8_t kCdefPrimaryTaps[2][2];
-extern const int8_t kCdefDirections[8][2][2];
+extern const int8_t kCdefDirectionsPadded[12][2][2];
} // namespace libgav1
diff --git a/chromium/third_party/libgav1/src/src/dsp/dsp.cc b/chromium/third_party/libgav1/src/src/dsp/dsp.cc
index db285a5f8a0..c1df27634cc 100644
--- a/chromium/third_party/libgav1/src/src/dsp/dsp.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/dsp.cc
@@ -94,6 +94,8 @@ void DspInit() {
LoopFilterInit_SSE4_1();
LoopRestorationInit_SSE4_1();
MaskBlendInit_SSE4_1();
+ MotionFieldProjectionInit_SSE4_1();
+ MotionVectorSearchInit_SSE4_1();
ObmcInit_SSE4_1();
SuperResInit_SSE4_1();
WarpInit_SSE4_1();
diff --git a/chromium/third_party/libgav1/src/src/dsp/dsp.h b/chromium/third_party/libgav1/src/src/dsp/dsp.h
index f5b5b366947..470436faf26 100644
--- a/chromium/third_party/libgav1/src/src/dsp/dsp.h
+++ b/chromium/third_party/libgav1/src/src/dsp/dsp.h
@@ -25,6 +25,7 @@
#include "src/dsp/constants.h"
#include "src/dsp/film_grain_common.h"
#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
#include "src/utils/types.h"
namespace libgav1 {
@@ -328,20 +329,15 @@ using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
// Cdef filtering function signature. Section 7.15.3.
// |source| is a pointer to the input block. |source_stride| is given in bytes.
-// |rows4x4| and |columns4x4| are frame sizes in units of 4x4 pixels.
-// |curr_x| and |curr_y| are current position in units of pixels.
-// |subsampling_x|, |subsampling_y| are the subsampling factors of current
-// plane.
+// |block_width|, |block_height| are the width/height of the input block.
// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
// parameters.
// |direction| is the filtering direction.
// |dest| is the output buffer. |dest_stride| is given in bytes.
using CdefFilteringFunc = void (*)(const void* source, ptrdiff_t source_stride,
- int rows4x4, int columns4x4, int curr_x,
- int curr_y, int subsampling_x,
- int subsampling_y, int primary_strength,
- int secondary_strength, int damping,
- int direction, void* dest,
+ int block_width, int block_height,
+ int primary_strength, int secondary_strength,
+ int damping, int direction, void* dest,
ptrdiff_t dest_stride);
// Upscaling process function signature. Section 7.16.
@@ -360,7 +356,8 @@ using SuperResRowFunc = void (*)(const void* source, const int upscaled_width,
// |source| is the input frame buffer, which is deblocked and cdef filtered.
// |dest| is the output.
// |restoration_info| contains loop restoration information, such as filter
-// type, strength. |source| and |dest| share the same stride given in bytes.
+// type, strength.
+// |source_stride| and |dest_stride| are given in pixels.
// |buffer| contains buffers required for self guided filter and wiener filter.
// They must be initialized before calling.
using LoopRestorationFunc = void (*)(
@@ -745,15 +742,7 @@ struct FilmGrainFuncs {
};
// Motion field projection function signature. Section 7.9.
-// |source_reference_type| corresponds to MfRefFrames[i * 2 + 1][j * 2 + 1] in
-// the spec.
-// |mv| corresponds to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
-// |order_hint| points to an array of kNumReferenceFrameTypes elements which
-// specifies OrderHintBits least significant bits of the expected output order
-// for reference frames.
-// |current_frame_order_hint| specifies OrderHintBits least significant bits of
-// the expected output order for this frame.
-// |order_hint_shift_bits| equals (32 - OrderHintBits) % 32.
+// |reference_info| provides reference information for motion field projection.
// |reference_to_current_with_sign| is the precalculated reference frame id
// distance from current frame.
// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
@@ -763,11 +752,9 @@ struct FilmGrainFuncs {
// |motion_field| is the output which saves the projected motion field
// information.
using MotionFieldProjectionKernelFunc = void (*)(
- const ReferenceFrameType* source_reference_type, const MotionVector* mv,
- const uint8_t order_hint[kNumReferenceFrameTypes],
- unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits,
- int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end,
- int x8_start, int x8_end, TemporalMotionField* motion_field);
+ const ReferenceInfo& reference_info, int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* motion_field);
// Compound temporal motion vector projection function signature.
// Section 7.9.3 and 7.10.2.10.
@@ -797,35 +784,35 @@ using MvProjectionSingleFunc = void (*)(
int reference_offset, int count, MotionVector* candidate_mvs);
struct Dsp {
- IntraPredictorFuncs intra_predictors;
+ AverageBlendFunc average_blend;
+ CdefDirectionFunc cdef_direction;
+ CdefFilteringFunc cdef_filter;
+ CflIntraPredictorFuncs cfl_intra_predictors;
+ CflSubsamplerFuncs cfl_subsamplers;
+ ConvolveFuncs convolve;
+ ConvolveScaleFuncs convolve_scale;
DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
+ DistanceWeightedBlendFunc distance_weighted_blend;
+ FilmGrainFuncs film_grain;
FilterIntraPredictorFunc filter_intra_predictor;
- CflIntraPredictorFuncs cfl_intra_predictors;
- CflSubsamplerFuncs cfl_subsamplers;
+ InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
IntraEdgeFilterFunc intra_edge_filter;
IntraEdgeUpsamplerFunc intra_edge_upsampler;
+ IntraPredictorFuncs intra_predictors;
InverseTransformAddFuncs inverse_transforms;
LoopFilterFuncs loop_filters;
- CdefDirectionFunc cdef_direction;
- CdefFilteringFunc cdef_filter;
- SuperResRowFunc super_res_row;
LoopRestorationFuncs loop_restorations;
+ MaskBlendFuncs mask_blend;
MotionFieldProjectionKernelFunc motion_field_projection_kernel;
MvProjectionCompoundFunc mv_projection_compound[3];
MvProjectionSingleFunc mv_projection_single[3];
- ConvolveFuncs convolve;
- ConvolveScaleFuncs convolve_scale;
- WeightMaskFuncs weight_mask;
- AverageBlendFunc average_blend;
- DistanceWeightedBlendFunc distance_weighted_blend;
- MaskBlendFuncs mask_blend;
- InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
ObmcBlendFuncs obmc_blend;
- WarpFunc warp;
+ SuperResRowFunc super_res_row;
WarpCompoundFunc warp_compound;
- FilmGrainFuncs film_grain;
+ WarpFunc warp;
+ WeightMaskFuncs weight_mask;
};
// Initializes function pointers based on build config and runtime
diff --git a/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake b/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake
index 06e23ee0f4f..00574fa1953 100644
--- a/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake
+++ b/chromium/third_party/libgav1/src/src/dsp/libgav1_dsp.cmake
@@ -24,6 +24,7 @@ list(APPEND libgav1_dsp_sources
"${libgav1_source}/dsp/average_blend.h"
"${libgav1_source}/dsp/cdef.cc"
"${libgav1_source}/dsp/cdef.h"
+ "${libgav1_source}/dsp/cdef.inc"
"${libgav1_source}/dsp/common.h"
"${libgav1_source}/dsp/constants.cc"
"${libgav1_source}/dsp/constants.h"
@@ -42,6 +43,7 @@ list(APPEND libgav1_dsp_sources
"${libgav1_source}/dsp/intrapred.h"
"${libgav1_source}/dsp/inverse_transform.cc"
"${libgav1_source}/dsp/inverse_transform.h"
+ "${libgav1_source}/dsp/inverse_transform.inc"
"${libgav1_source}/dsp/loop_filter.cc"
"${libgav1_source}/dsp/loop_filter.h"
"${libgav1_source}/dsp/loop_restoration.cc"
@@ -54,6 +56,7 @@ list(APPEND libgav1_dsp_sources
"${libgav1_source}/dsp/motion_vector_search.h"
"${libgav1_source}/dsp/obmc.cc"
"${libgav1_source}/dsp/obmc.h"
+ "${libgav1_source}/dsp/obmc.inc"
"${libgav1_source}/dsp/super_res.cc"
"${libgav1_source}/dsp/super_res.h"
"${libgav1_source}/dsp/warp.cc"
@@ -128,6 +131,10 @@ list(APPEND libgav1_dsp_sources_sse4
"${libgav1_source}/dsp/x86/loop_restoration_sse4.h"
"${libgav1_source}/dsp/x86/mask_blend_sse4.cc"
"${libgav1_source}/dsp/x86/mask_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h"
"${libgav1_source}/dsp/x86/obmc_sse4.cc"
"${libgav1_source}/dsp/x86/obmc_sse4.h"
"${libgav1_source}/dsp/x86/super_res_sse4.cc"
diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc b/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc
index 946952b029c..6cad97d4280 100644
--- a/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/loop_filter.cc
@@ -31,10 +31,10 @@ template <int bitdepth, typename Pixel>
struct LoopFilterFuncs_C {
LoopFilterFuncs_C() = delete;
- static const int kMaxPixel = (1 << bitdepth) - 1;
- static const int kMinSignedPixel = -(1 << (bitdepth - 1));
- static const int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
- static const int kFlatThresh = 1 << (bitdepth - 8);
+ static constexpr int kMaxPixel = (1 << bitdepth) - 1;
+ static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1));
+ static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
+ static constexpr int kFlatThresh = 1 << (bitdepth - 8);
static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
int inner_thresh, int hev_thresh);
diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc
index 467e33492fd..b2ae99c0882 100644
--- a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.cc
@@ -26,15 +26,6 @@
namespace libgav1 {
namespace dsp {
-namespace {
-
-// Precision of a division table (mtable)
-constexpr int kSgrProjScaleBits = 20;
-constexpr int kSgrProjReciprocalBits = 12;
-// Core self-guided restoration precision bits.
-constexpr int kSgrProjSgrBits = 8;
-// Precision bits of generated values higher than source before projection.
-constexpr int kSgrProjRestoreBits = 4;
// Section 7.17.3.
// a2: range [1, 256].
@@ -44,7 +35,7 @@ constexpr int kSgrProjRestoreBits = 4;
// a2 = 1;
// else
// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
-constexpr int kXByXPlus1[256] = {
+const int kXByXPlus1[256] = {
1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
@@ -64,65 +55,51 @@ constexpr int kXByXPlus1[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
256};
+// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+// sgr_ma2 = 256 - a2
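+// For example, z = 1 gives a2 = (256 + 0) / 2 = 128, so sgr_ma2 = 128; z = 0
+// is special-cased to a2 = 1, hence the leading 255.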
+const uint8_t kSgrMa2Lookup[256] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7,
+ 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0};
+
+namespace {
+
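+// The extra 2 columns of the stride hold the one-pixel border on either side
+// of the restoration unit (a 64-wide unit yields 66 intermediate columns).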
+constexpr ptrdiff_t kIntermediateStride = kRestorationUnitWidth + 2;
+
+struct SgrIntermediateBuffer {
+ uint16_t a; // [1, 256]
+ uint32_t b; // < 2^20. 32-bit is required for bitdepth 10 and up.
+};
+
+struct SgrBuffer {
+ // Circular buffer to save memory.
+ // The 2d arrays A and B in Section 7.17.3, the intermediate results in the
+ // box filter process. Reused for pass 0 and pass 1. Pass 0 uses 2 rows. Pass
+ // 1 uses 3 or 4 rows.
+ SgrIntermediateBuffer intermediate[6 * kIntermediateStride];
+};
+
constexpr int kOneByX[25] = {
4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
};
-// Compute integral image. In an integral image, each pixel value of (xi, yi)
-// is the sum of all pixel values {(x, y) | x <= xi, y <= yi} from the source
-// image.
-// The integral image (II) can be calculated as:
-// II(D) = Pixel(D) + II(B) + II(C) - II(A),
-// where the rectangular region ABCD is
-// A = (x, y), B = (x + 1, y), C = (x, y + 1), D = (x + 1, y + 1).
-// Integral image helps to compute the sum of a rectangular area fast.
-// The box centered at (x, y), with radius r, is rectangular ABCD:
-// A = (x - r, y - r), B = (x + r, y - r),
-// C = (x - r, y + r), D = (x + r, y + r),
-// The sum of the box, or the rectangular ABCD can be calculated with the
-// integral image (II):
-// sum = II(D) - II(B') - II(C') + II(A').
-// A' = (x - r - 1, y - r - 1), B' = (x + r, y - r - 1),
-// C' = (x - r - 1, y + r), D = (x + r, y + r),
-// Here we calculate the integral image, as well as the squared integral image.
-template <typename Pixel>
-void ComputeIntegralImage(const Pixel* const src, ptrdiff_t src_stride,
- int width, int height, uint16_t* integral_image,
- uint32_t* square_integral_image,
- ptrdiff_t image_stride) {
- memset(integral_image, 0, image_stride * sizeof(integral_image[0]));
- memset(square_integral_image, 0,
- image_stride * sizeof(square_integral_image[0]));
-
- const Pixel* src_ptr = src;
- uint16_t* integral_image_ptr = integral_image + image_stride + 1;
- uint32_t* square_integral_image_ptr =
- square_integral_image + image_stride + 1;
- int y = 0;
- do {
- integral_image_ptr[-1] = 0;
- square_integral_image_ptr[-1] = 0;
- for (int x = 0; x < width; ++x) {
- integral_image_ptr[x] = src_ptr[x] + integral_image_ptr[x - 1] +
- integral_image_ptr[x - image_stride] -
- integral_image_ptr[x - image_stride - 1];
- square_integral_image_ptr[x] =
- src_ptr[x] * src_ptr[x] + square_integral_image_ptr[x - 1] +
- square_integral_image_ptr[x - image_stride] -
- square_integral_image_ptr[x - image_stride - 1];
- }
- src_ptr += src_stride;
- integral_image_ptr += image_stride;
- square_integral_image_ptr += image_stride;
- } while (++y < height);
-}
-
template <int bitdepth, typename Pixel>
struct LoopRestorationFuncs_C {
LoopRestorationFuncs_C() = delete;
- // |stride| for SelfGuidedFilter and WienerFilter is given in bytes.
static void SelfGuidedFilter(const void* source, void* dest,
const RestorationUnitInfo& restoration_info,
ptrdiff_t source_stride, ptrdiff_t dest_stride,
@@ -132,15 +109,18 @@ struct LoopRestorationFuncs_C {
const RestorationUnitInfo& restoration_info,
ptrdiff_t source_stride, ptrdiff_t dest_stride,
int width, int height, RestorationBuffer* buffer);
- // |stride| for box filter processing is in Pixels.
- static void BoxFilterPreProcess(const RestorationUnitInfo& restoration_info,
- const uint16_t* integral_image,
- const uint32_t* square_integral_image,
- int width, int height, int pass,
- RestorationBuffer* buffer);
static void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
- const Pixel* src, ptrdiff_t stride, int width,
- int height, RestorationBuffer* buffer);
+ const Pixel* src, ptrdiff_t src_stride,
+ int width, int height, SgrBuffer* buffer,
+ Pixel* dst, ptrdiff_t dst_stride);
+ static void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const Pixel* src, ptrdiff_t src_stride,
+ int width, int height, SgrBuffer* buffer,
+ Pixel* dst, ptrdiff_t dst_stride);
+ static void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const Pixel* src, ptrdiff_t src_stride,
+ int width, int height, SgrBuffer* buffer,
+ Pixel* dst, ptrdiff_t dst_stride);
};
// Note: range of wiener filter coefficients.
@@ -154,7 +134,7 @@ struct LoopRestorationFuncs_C {
// filter[3] = 0 - (filter[0] + filter[1] + filter[2]) * 2.
// Thus in libaom's computation, an offset of 128 is needed for filter[3].
inline void PopulateWienerCoefficients(
- const RestorationUnitInfo& restoration_info, int direction,
+ const RestorationUnitInfo& restoration_info, const int direction,
int16_t* const filter) {
filter[3] = 128;
for (int i = 0; i < 3; ++i) {
@@ -178,26 +158,64 @@ inline int CountZeroCoefficients(const int16_t* const filter) {
return number_zero_coefficients;
}
-template <typename Pixel>
-inline int WienerHorizontal(const Pixel* const source,
- const int16_t* const filter,
- const int number_zero_coefficients, int sum) {
+template <int bitdepth, typename Pixel>
+inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
+ const int width, const int height,
+ const int16_t* const filter,
+ const int number_zero_coefficients,
+ uint16_t** wiener_buffer) {
constexpr int kCenterTap = (kSubPixelTaps - 1) / 2;
- for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
- sum += filter[k] * (source[k] + source[kSubPixelTaps - 2 - k]);
- }
- return sum;
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int limit =
+ (1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1;
+ constexpr int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ // sum fits into 16 bits only when bitdepth = 8.
+ int sum = horizontal_rounding;
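+ // The Wiener filter taps are symmetric, so each mirrored pair of source
+ // pixels shares a single multiply by filter[k].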
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum += filter[k] * (source[x + k] + source[x + kSubPixelTaps - 2 - k]);
+ }
+ sum += filter[kCenterTap] * source[x + kCenterTap];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ (*wiener_buffer)[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit));
+ } while (++x < width);
+ source += source_stride;
+ *wiener_buffer += width;
+ } while (--y != 0);
}
-inline int WienerVertical(const uint16_t* const source,
- const int16_t* const filter, const int width,
- const int number_zero_coefficients, int sum) {
+template <int bitdepth, typename Pixel>
+inline void WienerVertical(const uint16_t* wiener_buffer, const int width,
+ const int height, const int16_t* const filter,
+ const int number_zero_coefficients, void* const dest,
+ const ptrdiff_t dest_stride) {
constexpr int kCenterTap = (kSubPixelTaps - 1) / 2;
- for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
- sum += filter[k] *
- (source[k * width] + source[(kSubPixelTaps - 2 - k) * width]);
- }
- return sum;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ constexpr int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1));
+ auto* dst = static_cast<Pixel*>(dest);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ // sum needs 32 bits.
+ int sum = vertical_rounding;
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum += filter[k] * (wiener_buffer[k * width + x] +
+ wiener_buffer[(kSubPixelTaps - 2 - k) * width + x]);
+ }
+ sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
+ } while (++x < width);
+ wiener_buffer += width;
+ dst += dest_stride;
+ } while (--y != 0);
}
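
Both helpers implement the same symmetric accumulation and differ only in the stride of the paired taps and in the rounding constants. A minimal scalar sketch of one horizontal output sample, assuming the usual libgav1 constants (kSubPixelTaps == 8, kWienerFilterBits == 7) and taking the rounding shift as a parameter rather than asserting its value:

    #include <algorithm>
    #include <cstdint>

    // One symmetric 7-tap Wiener sample for 8-bit input; src must point at
    // the first of the 7 samples covered by the kernel.
    int WienerSample8bpp(const uint8_t src[7], const int16_t filter[4],
                         int round_bits) {
      constexpr int kCenterTap = 3;
      int sum = 1 << (8 + 7 - 1);  // horizontal_rounding for bitdepth 8.
      for (int k = 0; k < kCenterTap; ++k) {
        sum += filter[k] * (src[k] + src[6 - k]);  // mirrored tap pair
      }
      sum += filter[kCenterTap] * src[kCenterTap];
      const int rounded = (sum + (1 << (round_bits - 1))) >> round_bits;
      const int limit = (1 << (8 + 1 + 7 - round_bits)) - 1;
      return std::min(std::max(rounded, 0), limit);
    }

The vertical pass has the same shape, but pairs rows of the intermediate buffer (a stride of |width|) and clamps to the pixel range instead of |limit|.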
// Note: bit range for wiener filter.
@@ -223,13 +241,6 @@ void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter(
ptrdiff_t dest_stride, int width, int height,
RestorationBuffer* const buffer) {
constexpr int kCenterTap = (kSubPixelTaps - 1) / 2;
- constexpr int kRoundBitsHorizontal = (bitdepth == 12)
- ? kInterRoundBitsHorizontal12bpp
- : kInterRoundBitsHorizontal;
- constexpr int kRoundBitsVertical =
- (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
- const int limit =
- (1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1;
int16_t filter_horizontal[kSubPixelTaps / 2];
int16_t filter_vertical[kSubPixelTaps / 2];
PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
@@ -240,448 +251,470 @@ void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter(
CountZeroCoefficients(filter_horizontal);
const int number_zero_coefficients_vertical =
CountZeroCoefficients(filter_vertical);
-
- source_stride /= sizeof(Pixel);
- dest_stride /= sizeof(Pixel);
+ const int number_rows_to_skip =
+ std::max(number_zero_coefficients_vertical, 1);
// horizontal filtering.
const auto* src = static_cast<const Pixel*>(source);
- src -= (kCenterTap - number_zero_coefficients_vertical) * source_stride +
- kCenterTap;
- auto* wiener_buffer =
- buffer->wiener_buffer + number_zero_coefficients_vertical * width;
- const int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1);
- int y = height + kSubPixelTaps - 2 - 2 * number_zero_coefficients_vertical;
+ src -= (kCenterTap - number_rows_to_skip) * source_stride + kCenterTap;
+ auto* wiener_buffer = buffer->wiener_buffer + number_rows_to_skip * width;
+ const int height_horizontal =
+ height + kSubPixelTaps - 2 - 2 * number_rows_to_skip;
if (number_zero_coefficients_horizontal == 0) {
- do {
- int x = 0;
- do {
- // sum fits into 16 bits only when bitdepth = 8.
- int sum = horizontal_rounding;
- sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 0, sum);
- sum += filter_horizontal[kCenterTap] * src[x + kCenterTap];
- const int rounded_sum =
- RightShiftWithRounding(sum, kRoundBitsHorizontal);
- wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit));
- } while (++x < width);
- src += source_stride;
- wiener_buffer += width;
- } while (--y != 0);
+ WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
+ height_horizontal, filter_horizontal, 0,
+ &wiener_buffer);
} else if (number_zero_coefficients_horizontal == 1) {
- do {
- int x = 0;
- do {
- // sum fits into 16 bits only when bitdepth = 8.
- int sum = horizontal_rounding;
- sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 1, sum);
- sum += filter_horizontal[kCenterTap] * src[x + kCenterTap];
- const int rounded_sum =
- RightShiftWithRounding(sum, kRoundBitsHorizontal);
- wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit));
- } while (++x < width);
- src += source_stride;
- wiener_buffer += width;
- } while (--y != 0);
+ WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
+ height_horizontal, filter_horizontal, 1,
+ &wiener_buffer);
} else if (number_zero_coefficients_horizontal == 2) {
- do {
- int x = 0;
- do {
- // sum fits into 16 bits only when bitdepth = 8.
- int sum = horizontal_rounding;
- sum = WienerHorizontal<Pixel>(src + x, filter_horizontal, 2, sum);
- sum += filter_horizontal[kCenterTap] * src[x + kCenterTap];
- const int rounded_sum =
- RightShiftWithRounding(sum, kRoundBitsHorizontal);
- wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit));
- } while (++x < width);
- src += source_stride;
- wiener_buffer += width;
- } while (--y != 0);
+ WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
+ height_horizontal, filter_horizontal, 2,
+ &wiener_buffer);
} else {
- do {
- int x = 0;
- do {
- // sum fits into 16 bits only when bitdepth = 8.
- int sum = horizontal_rounding;
- sum += filter_horizontal[kCenterTap] * src[x + kCenterTap];
- const int rounded_sum =
- RightShiftWithRounding(sum, kRoundBitsHorizontal);
- wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit));
- } while (++x < width);
- src += source_stride;
- wiener_buffer += width;
- } while (--y != 0);
+ WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
+ height_horizontal, filter_horizontal, 3,
+ &wiener_buffer);
}
// vertical filtering.
- const int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1));
- auto* dst = static_cast<Pixel*>(dest);
- wiener_buffer = buffer->wiener_buffer;
- y = height;
-
if (number_zero_coefficients_vertical == 0) {
- do {
- int x = 0;
- do {
- // sum needs 32 bits.
- int sum = vertical_rounding;
- sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 0, sum);
- sum +=
- filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x];
- const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
- dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
- } while (++x < width);
- dst += dest_stride;
- wiener_buffer += width;
- } while (--y != 0);
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer, wiener_buffer - width,
+ sizeof(*wiener_buffer) * width);
+ memcpy(buffer->wiener_buffer, buffer->wiener_buffer + width,
+ sizeof(*wiener_buffer) * width);
+ WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height,
+ filter_vertical, 0, dest, dest_stride);
} else if (number_zero_coefficients_vertical == 1) {
- do {
- int x = 0;
- do {
- // sum needs 32 bits.
- int sum = vertical_rounding;
- sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 1, sum);
- sum +=
- filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x];
- const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
- dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
- } while (++x < width);
- dst += dest_stride;
- wiener_buffer += width;
- } while (--y != 0);
+ WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height,
+ filter_vertical, 1, dest, dest_stride);
} else if (number_zero_coefficients_vertical == 2) {
- do {
- int x = 0;
- do {
- // sum needs 32 bits.
- int sum = vertical_rounding;
- sum = WienerVertical(wiener_buffer + x, filter_vertical, width, 2, sum);
- sum +=
- filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x];
- const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
- dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
- } while (++x < width);
- dst += dest_stride;
- wiener_buffer += width;
- } while (--y != 0);
+ WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height,
+ filter_vertical, 2, dest, dest_stride);
} else {
- do {
- int x = 0;
- do {
- // sum needs 32 bits.
- int sum = vertical_rounding;
- sum +=
- filter_vertical[kCenterTap] * wiener_buffer[kCenterTap * width + x];
- const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
- dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
- } while (++x < width);
- dst += dest_stride;
- wiener_buffer += width;
- } while (--y != 0);
+ WienerVertical<bitdepth, Pixel>(buffer->wiener_buffer, width, height,
+ filter_vertical, 3, dest, dest_stride);
}
}
+//------------------------------------------------------------------------------
+// SGR
+
+template <int bitdepth>
+inline void CalculateIntermediate(const uint32_t s, uint32_t a,
+ const uint32_t b, const uint32_t n,
+ SgrIntermediateBuffer* const intermediate) {
+ // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
+  // Since max bitdepth = 12, max < 2^31.
+  // After the shift, a < 2^16 * n < 2^22 regardless of bitdepth.
+ a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
+ // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
+ // d < 2^8 * n < 2^14 regardless of bitdepth
+ const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
+  // p: Each term in calculating p = a * n - d * d is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // This bound on p is due to:
+ // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+  // Note: Sometimes, in high bitdepth, we can end up with a * n < d * d.
+ // This is an artifact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+ // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
+ // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+ // a2: range [1, 256].
+ uint32_t a2 = kXByXPlus1[std::min(z, 255u)];
+ const uint32_t one_over_n = kOneByX[n - 1];
+ // (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n,
+ // one_over_n = round(2^12 / n)
+ // => the product here is < 2^(20 + bitdepth) <= 2^32,
+ // and b is set to a value < 2^(8 + bitdepth).
+ // This holds even with the rounding in one_over_n and in the overall
+ // result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8.
+ const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n;
+ intermediate->a = a2;
+ intermediate->b = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+}
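
The saturation to p = 0 described above is easy to reproduce: for a perfectly flat patch the scaled variance a * n - d * d is exactly zero (and can dip slightly negative at high bitdepth because a and b are rounded independently). A small standalone check for the flat 8-bit, 5x5 case, where the bitdepth-8 shifts are no-ops:

    #include <cassert>
    #include <cstdint>

    int main() {
      // 5x5 window of identical 8-bit pixels.
      const uint32_t pixel = 200;
      const uint32_t n = 25;
      uint32_t a = 0, b = 0;
      for (uint32_t i = 0; i < n; ++i) {
        a += pixel * pixel;  // sum of squares
        b += pixel;          // sum
      }
      // At bitdepth 8 the shifts by (bitdepth - 8) are no-ops, so d == b.
      const uint32_t d = b;
      // Variance term: exactly zero for a flat patch, so p saturates to 0
      // and z == 0 regardless of the scale s.
      const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
      assert(p == 0);
      return static_cast<int>(p);
    }
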
+
template <int bitdepth, typename Pixel>
-void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess(
- const RestorationUnitInfo& restoration_info, const uint16_t* integral_image,
- const uint32_t* square_integral_image, int width, int height, int pass,
- RestorationBuffer* const buffer) {
- const int sgr_proj_index = restoration_info.sgr_proj_info.index;
- const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
- assert(radius != 0);
- const uint32_t n = (2 * radius + 1) * (2 * radius + 1);
- // const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
- // n2_with_scale: max value < 2^16. min value is 4.
- // const uint32_t n2_with_scale = n * n * scale;
- // s: max value < 2^12.
- // const uint32_t s =
- // ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
- const uint32_t s = kSgrScaleParameter[sgr_proj_index][pass];
- assert(s != 0);
- const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride;
- const ptrdiff_t integral_image_stride =
- kRestorationProcessingUnitSizeWithBorders + 1;
- // The size of the intermediate result buffer is the size of the filter area
- // plus horizontal (3) and vertical (3) padding. The processing start point
- // is the filter area start point -1 row and -1 column. Therefore we need to
- // set offset and use the intermediate_result as the start point for
- // processing.
- const ptrdiff_t intermediate_buffer_offset =
- kRestorationBorder * array_stride + kRestorationBorder;
- uint32_t* intermediate_result[2] = {
- buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset -
- array_stride,
- buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset -
- array_stride};
-
- // Calculate intermediate results, including one-pixel border, for example,
- // if unit size is 64x64, we calculate 66x66 pixels.
- const int step = (pass == 0) ? 2 : 1;
- const ptrdiff_t intermediate_stride = step * array_stride;
- for (int y = -1; y <= height; y += step) {
- for (int x = -1; x <= width; ++x) {
- // The integral image helps to calculate the sum of the square
- // centered at (x, y).
- // The calculation of a, b is equal to the following lines:
- // uint32_t a = 0;
- // uint32_t b = 0;
- // for (int dy = -radius; dy <= radius; ++dy) {
- // for (int dx = -radius; dx <= radius; ++dx) {
- // const Pixel source = src[(y + dy) * stride + (x + dx)];
- // a += source * source;
- // b += source;
- // }
- // }
- const int top_left =
- (y + kRestorationBorder - radius) * integral_image_stride + x +
- kRestorationBorder - radius;
- const int top_right = top_left + 2 * radius + 1;
- const int bottom_left =
- top_left + (2 * radius + 1) * integral_image_stride;
- const int bottom_right = bottom_left + 2 * radius + 1;
- uint32_t a = square_integral_image[bottom_right] -
- square_integral_image[bottom_left] -
- square_integral_image[top_right] +
- square_integral_image[top_left];
- uint32_t b;
-
- if (bitdepth <= 10 || radius < 2) {
- // The following cast is mandatory to get truncated sum.
- b = static_cast<uint16_t>(
- integral_image[bottom_right] - integral_image[bottom_left] -
- integral_image[top_right] + integral_image[top_left]);
- } else {
- assert(radius == 2);
- const uint16_t b_top_15_pixels =
- integral_image[top_right + 3 * integral_image_stride] -
- integral_image[top_left + 3 * integral_image_stride] -
- integral_image[top_right] + integral_image[top_left];
- const uint16_t b_bottom_10_pixels =
- integral_image[bottom_right] - integral_image[bottom_left] -
- integral_image[top_right + 3 * integral_image_stride] +
- integral_image[top_left + 3 * integral_image_stride];
- b = b_top_15_pixels + b_bottom_10_pixels;
- }
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessTop(
+ const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s,
+ SgrIntermediateBuffer* intermediate) {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dx = 0; dx < 5; ++dx) {
+ const Pixel source = src[dx];
+ a += source * source;
+ b += source;
+ }
+ a += a;
+ b += b;
+ for (int dy = 1; dy < 4; ++dy) {
+ for (int dx = 0; dx < 5; ++dx) {
+ const Pixel source = src[dy * stride + dx];
+ a += source * source;
+ b += source;
+ }
+ }
+ CalculateIntermediate<bitdepth>(s, a, b, 25, intermediate);
+ int x = width - 1;
+ do {
+ {
+ const Pixel source0 = src[0];
+ const Pixel source1 = src[5];
+ a += 2 * (source1 * source1 - source0 * source0);
+ b += 2 * (source1 - source0);
+ }
+ int dy = 1;
+ do {
+ const Pixel source0 = src[dy * stride];
+ const Pixel source1 = src[dy * stride + 5];
+ a -= source0 * source0;
+ a += source1 * source1;
+ b -= source0;
+ b += source1;
+ } while (++dy < 4);
+ src++;
+ CalculateIntermediate<bitdepth>(s, a, b, 25, ++intermediate);
+ } while (--x != 0);
+}
- // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
- // since max bitdepth = 12, max < 2^31.
- // after shift, a < 2^16 * n < 2^22 regardless of bitdepth
- a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
- // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
- // d < 2^8 * n < 2^14 regardless of bitdepth
- const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
- // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
- // and p itself satisfies p < 2^14 * n^2 < 2^26.
- // This bound on p is due to:
- // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
- // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
- // This is an artifact of rounding, and can only happen if all pixels
- // are (almost) identical, so in this case we saturate to p=0.
- const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
- // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
- // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
- // (this holds even after accounting for the rounding in s)
- const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
- // a2: range [1, 256].
- uint32_t a2 = kXByXPlus1[std::min(z, 255u)];
- const uint32_t one_over_n = kOneByX[n - 1];
- // (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n,
- // one_over_n = round(2^12 / n)
- // => the product here is < 2^(20 + bitdepth) <= 2^32,
- // and b is set to a value < 2^(8 + bitdepth).
- // This holds even with the rounding in one_over_n and in the overall
- // result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8.
- const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n;
- intermediate_result[0][x] = a2;
- intermediate_result[1][x] =
- RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+template <int bitdepth, typename Pixel, int size>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s,
+ SgrIntermediateBuffer* intermediate) {
+ const int n = size * size;
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dy = 0; dy < size; ++dy) {
+ for (int dx = 0; dx < size; ++dx) {
+ const Pixel source = src[dy * stride + dx];
+ a += source * source;
+ b += source;
}
- intermediate_result[0] += intermediate_stride;
- intermediate_result[1] += intermediate_stride;
}
+ CalculateIntermediate<bitdepth>(s, a, b, n, intermediate);
+ int x = width - 1;
+ do {
+ int dy = 0;
+ do {
+ const Pixel source0 = src[dy * stride];
+ const Pixel source1 = src[dy * stride + size];
+ a -= source0 * source0;
+ a += source1 * source1;
+ b -= source0;
+ b += source1;
+ } while (++dy < size);
+ src++;
+ CalculateIntermediate<bitdepth>(s, a, b, n, ++intermediate);
+ } while (--x != 0);
+}
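
After the first output column, the helpers above update the box sums incrementally: each of the |size| rows drops its leaving sample and gains its entering sample, so the cost per column is O(size) rather than O(size^2). A one-dimensional sketch of the same sliding-window idea (names are illustrative, not from the diff):

    #include <cstdint>
    #include <vector>

    // Running sums of a window of `size` consecutive samples, updated in
    // O(1) per output instead of O(size), mirroring the column updates above.
    std::vector<uint32_t> SlidingBoxSum(const std::vector<uint8_t>& src,
                                        int size) {
      std::vector<uint32_t> out;
      if (static_cast<int>(src.size()) < size) return out;
      uint32_t sum = 0;
      for (int i = 0; i < size; ++i) sum += src[i];
      out.push_back(sum);
      for (size_t i = size; i < src.size(); ++i) {
        sum += src[i];         // entering sample
        sum -= src[i - size];  // leaving sample
        out.push_back(sum);
      }
      return out;
    }
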
+
+template <int bitdepth, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessBottom(
+ const Pixel* src, const ptrdiff_t stride, const int width, const uint32_t s,
+ SgrIntermediateBuffer* intermediate) {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dx = 0; dx < 5; ++dx) {
+ const Pixel source = src[3 * stride + dx];
+ a += source * source;
+ b += source;
+ }
+ a += a;
+ b += b;
+ for (int dy = 0; dy < 3; ++dy) {
+ for (int dx = 0; dx < 5; ++dx) {
+ const Pixel source = src[dy * stride + dx];
+ a += source * source;
+ b += source;
+ }
+ }
+ CalculateIntermediate<bitdepth>(s, a, b, 25, intermediate);
+ int x = width - 1;
+ do {
+ {
+ const Pixel source0 = src[3 * stride + 0];
+ const Pixel source1 = src[3 * stride + 5];
+ a += 2 * (source1 * source1 - source0 * source0);
+ b += 2 * (source1 - source0);
+ }
+ int dy = 0;
+ do {
+ const Pixel source0 = src[dy * stride];
+ const Pixel source1 = src[dy * stride + 5];
+ a -= source0 * source0;
+ a += source1 * source1;
+ b -= source0;
+ b += source1;
+ } while (++dy < 3);
+ src++;
+ CalculateIntermediate<bitdepth>(s, a, b, 25, ++intermediate);
+ } while (--x != 0);
+}
+
+inline void Sum565(const SgrIntermediateBuffer* const intermediate,
+ uint16_t* const a, uint32_t* const b) {
+ *a = 5 * (intermediate[0].a + intermediate[2].a) + 6 * intermediate[1].a;
+ *b = 5 * (intermediate[0].b + intermediate[2].b) + 6 * intermediate[1].b;
+}
+
+template <typename Pixel>
+inline int CalculateFilteredOutput(const Pixel src, const uint32_t a,
+ const uint32_t b, const int shift) {
+ // v < 2^32. All intermediate calculations are positive.
+ const uint32_t v = a * src + b;
+ return RightShiftWithRounding(v,
+ kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <typename Pixel>
+inline void BoxFilterPass1(const Pixel src0, const Pixel src1,
+ const SgrIntermediateBuffer* const intermediate[2],
+ const ptrdiff_t x, int p[2]) {
+ uint16_t a[2];
+ uint32_t b[2];
+ Sum565(intermediate[0] + x, &a[0], &b[0]);
+ Sum565(intermediate[1] + x, &a[1], &b[1]);
+ p[0] = CalculateFilteredOutput<Pixel>(src0, a[0] + a[1], b[0] + b[1], 5);
+ p[1] = CalculateFilteredOutput<Pixel>(src1, a[1], b[1], 4);
+}
+
+template <typename Pixel>
+inline int BoxFilterPass2(const Pixel src,
+ const SgrIntermediateBuffer* const intermediate[3],
+ const ptrdiff_t x) {
+ const uint32_t a = 3 * (intermediate[0][x + 0].a + intermediate[0][x + 2].a +
+ intermediate[2][x + 0].a + intermediate[2][x + 2].a) +
+ 4 * (intermediate[0][x + 1].a + intermediate[1][x + 0].a +
+ intermediate[1][x + 1].a + intermediate[1][x + 2].a +
+ intermediate[2][x + 1].a);
+ const uint32_t b = 3 * (intermediate[0][x + 0].b + intermediate[0][x + 2].b +
+ intermediate[2][x + 0].b + intermediate[2][x + 2].b) +
+ 4 * (intermediate[0][x + 1].b + intermediate[1][x + 0].b +
+ intermediate[1][x + 1].b + intermediate[1][x + 2].b +
+ intermediate[2][x + 1].b);
+ return CalculateFilteredOutput<Pixel>(src, a, b, 5);
+}
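
The weights above are normalized so that the shift passed to CalculateFilteredOutput() matches the weight total: one 5-6-5 row sums to 16, pass 1 combines two such rows (32, shift 5) or one (16, shift 4), and the 3/4 cross of pass 2 sums to 4 * 3 + 5 * 4 = 32 (shift 5). A couple of compile-time checks, purely illustrative:

    #include <cstdint>

    // Weight budgets behind the shifts used in CalculateFilteredOutput().
    static_assert(5 + 6 + 5 == 16, "Sum565 row weight");
    static_assert(2 * (5 + 6 + 5) == 1 << 5, "pass 1 two-row weight");
    static_assert(4 * 3 + 5 * 4 == 1 << 5, "pass 2 cross weight");

    int main() { return 0; }
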
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedDoubleMultiplier(const int src,
+ const int box_filter_process_output0,
+ const int box_filter_process_output1,
+ const int16_t w0, const int16_t w1,
+ const int16_t w2) {
+ const int v = w1 * (src << kSgrProjRestoreBits) +
+ w0 * box_filter_process_output0 +
+ w2 * box_filter_process_output1;
+  // If radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
+  // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
+  // Then the range of s is bitdepth + 2. This is a rough estimate, taking
+  // the maximum value of each element.
+ const int s =
+ RightShiftWithRounding(v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedSingleMultiplier(const int src,
+ const int box_filter_process_output,
+ const int16_t w0, const int16_t w1) {
+ const int v =
+ w1 * (src << kSgrProjRestoreBits) + w0 * box_filter_process_output;
+  // If radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
+  // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
+  // Then the range of s is bitdepth + 2. This is a rough estimate, taking
+  // the maximum value of each element.
+ const int s =
+ RightShiftWithRounding(v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
}
template <int bitdepth, typename Pixel>
-void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
+inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
const RestorationUnitInfo& restoration_info, const Pixel* src,
- ptrdiff_t stride, int width, int height, RestorationBuffer* const buffer) {
+ const ptrdiff_t src_stride, const int width, const int height,
+ SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) {
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t s0 = kSgrScaleParameter[sgr_proj_index][0]; // s0 < 2^12.
+ const uint32_t s1 = kSgrScaleParameter[sgr_proj_index][1]; // s1 < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ SgrIntermediateBuffer *intermediate0[2], *intermediate1[4];
+ assert(s0 != 0);
+ assert(s1 != 0);
+ intermediate0[0] = buffer->intermediate;
+ intermediate0[1] = intermediate0[0] + kIntermediateStride;
+ intermediate1[0] = intermediate0[1] + kIntermediateStride;
+  intermediate1[1] = intermediate1[0] + kIntermediateStride;
+  intermediate1[2] = intermediate1[1] + kIntermediateStride;
+ intermediate1[3] = intermediate1[2] + kIntermediateStride;
+ BoxFilterPreProcessTop<bitdepth, Pixel>(src - 2 * src_stride - 3, src_stride,
+ width + 2, s0, intermediate0[0]);
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2 * src_stride - 2, src_stride,
+ width + 2, s1, intermediate1[0]);
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 1 * src_stride - 2, src_stride,
+ width + 2, s1, intermediate1[1]);
+ for (int y = height >> 1; y != 0; --y) {
+ BoxFilterPreProcess<bitdepth, Pixel, 5>(src - src_stride - 3, src_stride,
+ width + 2, s0, intermediate0[1]);
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s1,
+ intermediate1[2]);
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src + src_stride - 2, src_stride,
+ width + 2, s1, intermediate1[3]);
+ int x = 0;
+ do {
+ int p[2][2];
+ BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate0, x,
+ p[0]);
+ p[1][0] = BoxFilterPass2<Pixel>(src[x], intermediate1, x);
+ p[1][1] =
+ BoxFilterPass2<Pixel>(src[src_stride + x], intermediate1 + 1, x);
+ dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+ p[1][0], w0, w1, w2);
+ dst[dst_stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+ src[src_stride + x], p[0][1], p[1][1], w0, w1, w2);
+ } while (++x < width);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ std::swap(intermediate0[0], intermediate0[1]);
+ std::swap(intermediate1[0], intermediate1[2]);
+ std::swap(intermediate1[1], intermediate1[3]);
+ }
+ if ((height & 1) != 0) {
+ BoxFilterPreProcessBottom<bitdepth, Pixel>(src - src_stride - 3, src_stride,
+ width + 2, s0, intermediate0[1]);
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s1,
+ intermediate1[2]);
+ int x = 0;
+ do {
+ int p[2][2];
+ BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate0, x,
+ p[0]);
+ p[1][0] = BoxFilterPass2<Pixel>(src[x], intermediate1, x);
+ dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+ p[1][0], w0, w1, w2);
+ } while (++x < width);
+ }
+}
- // We calculate intermediate values for the region (width + 1) x (height + 1).
- // The region we can access is (width + 1 + radius) x (height + 1 + radius).
- // The max radius is 2. width = height =
- // kRestorationProcessingUnitSizeWithBorders.
- // For the integral_image, we need one row before the accessible region,
- // so the stride is kRestorationProcessingUnitSizeWithBorders + 1.
- // We fix the first row and first column of integral image be 0 to facilitate
- // computation.
-
- // Note that the max sum = (2 ^ bitdepth - 1) *
- // kRestorationProcessingUnitSizeWithBorders *
- // kRestorationProcessingUnitSizeWithBorders.
- // The max sum is larger than 2^16.
- // Case 8 bit and 10 bit:
- // The final box sum has at most 25 pixels, which is within 16 bits. So
- // keeping truncated 16-bit values is enough.
- // Case 12 bit, radius 1:
- // The final box sum has 9 pixels, which is within 16 bits. So keeping
- // truncated 16-bit values is enough.
- // Case 12 bit, radius 2:
- // The final box sum has 25 pixels. It can be calculated by calculating the
- // top 15 pixels and the bottom 10 pixels separately, and adding them
- // together. So keeping truncated 16-bit values is enough.
- // If it is slower than using 32-bit for specific CPU targets, please split
- // into 2 paths.
- uint16_t integral_image[(kRestorationProcessingUnitSizeWithBorders + 1) *
- (kRestorationProcessingUnitSizeWithBorders + 1)];
-
- // Note that the max squared sum =
- // (2 ^ bitdepth - 1) * (2 ^ bitdepth - 1) *
- // kRestorationProcessingUnitSizeWithBorders *
- // kRestorationProcessingUnitSizeWithBorders.
- // For 8 bit, 32-bit is enough. For 10 bit and up, the sum could be larger
- // than 2^32. However, the final box sum has at most 25 squares, which is
- // within 32 bits. So keeping truncated 32-bit values is enough.
- uint32_t
- square_integral_image[(kRestorationProcessingUnitSizeWithBorders + 1) *
- (kRestorationProcessingUnitSizeWithBorders + 1)];
- const ptrdiff_t integral_image_stride =
- kRestorationProcessingUnitSizeWithBorders + 1;
- const ptrdiff_t filtered_output_stride =
- buffer->box_filter_process_output_stride;
- const ptrdiff_t intermediate_stride =
- buffer->box_filter_process_intermediate_stride;
- const ptrdiff_t intermediate_buffer_offset =
- kRestorationBorder * intermediate_stride + kRestorationBorder;
-
- ComputeIntegralImage<Pixel>(
- src - kRestorationBorder * stride - kRestorationBorder, stride,
- width + 2 * kRestorationBorder, height + 2 * kRestorationBorder,
- integral_image, square_integral_image, integral_image_stride);
-
- for (int pass = 0; pass < 2; ++pass) {
- const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
- if (radius == 0) continue;
- LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess(
- restoration_info, integral_image, square_integral_image, width, height,
- pass, buffer);
-
- const Pixel* src_ptr = src;
- // Set intermediate buffer start point to the actual start point of
- // filtering.
- const uint32_t* array_start[2] = {
- buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset,
- buffer->box_filter_process_intermediate[1] +
- intermediate_buffer_offset};
- int* filtered_output = buffer->box_filter_process_output[pass];
- for (int y = 0; y < height; ++y) {
- const int shift = (pass == 0 && (y & 1) != 0) ? 4 : 5;
- // array_start[0]: range [1, 256].
- // array_start[1] < 2^20.
- for (int x = 0; x < width; ++x) {
- uint32_t a, b;
- if (pass == 0) {
- if ((y & 1) == 0) {
- a = 5 * (array_start[0][-intermediate_stride + x - 1] +
- array_start[0][-intermediate_stride + x + 1] +
- array_start[0][intermediate_stride + x - 1] +
- array_start[0][intermediate_stride + x + 1]) +
- 6 * (array_start[0][-intermediate_stride + x] +
- array_start[0][intermediate_stride + x]);
- b = 5 * (array_start[1][-intermediate_stride + x - 1] +
- array_start[1][-intermediate_stride + x + 1] +
- array_start[1][intermediate_stride + x - 1] +
- array_start[1][intermediate_stride + x + 1]) +
- 6 * (array_start[1][-intermediate_stride + x] +
- array_start[1][intermediate_stride + x]);
- } else {
- a = 5 * (array_start[0][x - 1] + array_start[0][x + 1]) +
- 6 * array_start[0][x];
- b = 5 * (array_start[1][x - 1] + array_start[1][x + 1]) +
- 6 * array_start[1][x];
- }
- } else {
- a = 3 * (array_start[0][-intermediate_stride + x - 1] +
- array_start[0][-intermediate_stride + x + 1] +
- array_start[0][intermediate_stride + x - 1] +
- array_start[0][intermediate_stride + x + 1]) +
- 4 * (array_start[0][-intermediate_stride + x] +
- array_start[0][x - 1] + array_start[0][x] +
- array_start[0][x + 1] +
- array_start[0][intermediate_stride + x]);
- b = 3 * (array_start[1][-intermediate_stride + x - 1] +
- array_start[1][-intermediate_stride + x + 1] +
- array_start[1][intermediate_stride + x - 1] +
- array_start[1][intermediate_stride + x + 1]) +
- 4 * (array_start[1][-intermediate_stride + x] +
- array_start[1][x - 1] + array_start[1][x] +
- array_start[1][x + 1] +
- array_start[1][intermediate_stride + x]);
- }
- // v < 2^32. All intermediate calculations are positive.
- const uint32_t v = a * src_ptr[x] + b;
- filtered_output[x] = RightShiftWithRounding(
- v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
- }
- src_ptr += stride;
- array_start[0] += intermediate_stride;
- array_start[1] += intermediate_stride;
- filtered_output += filtered_output_stride;
- }
+template <int bitdepth, typename Pixel>
+inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass1(
+ const RestorationUnitInfo& restoration_info, const Pixel* src,
+ const ptrdiff_t src_stride, const int width, const int height,
+ SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) {
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t s = kSgrScaleParameter[sgr_proj_index][0]; // s < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0;
+ SgrIntermediateBuffer* intermediate[2];
+ assert(s != 0);
+ intermediate[0] = buffer->intermediate;
+ intermediate[1] = intermediate[0] + kIntermediateStride;
+ BoxFilterPreProcessTop<bitdepth, Pixel>(src - 2 * src_stride - 3, src_stride,
+ width + 2, s, intermediate[0]);
+ for (int y = height >> 1; y != 0; --y) {
+ BoxFilterPreProcess<bitdepth, Pixel, 5>(src - src_stride - 3, src_stride,
+ width + 2, s, intermediate[1]);
+ int x = 0;
+ do {
+ int p[2];
+ BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate, x, p);
+ dst[x] =
+ SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0, w1);
+ dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(
+ src[src_stride + x], p[1], w0, w1);
+ } while (++x < width);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ std::swap(intermediate[0], intermediate[1]);
+ }
+ if ((height & 1) != 0) {
+ BoxFilterPreProcessBottom<bitdepth, Pixel>(src - src_stride - 3, src_stride,
+ width + 2, s, intermediate[1]);
+ int x = 0;
+ do {
+ int p[2];
+ BoxFilterPass1<Pixel>(src[x], src[src_stride + x], intermediate, x, p);
+ dst[x] =
+ SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0, w1);
+ dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(
+ src[src_stride + x], p[1], w0, w1);
+ } while (++x < width);
}
}
-// Assume box_filter_process_output[2] are allocated before calling
-// this function. Their sizes are width * height, stride equals width.
+template <int bitdepth, typename Pixel>
+inline void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass2(
+ const RestorationUnitInfo& restoration_info, const Pixel* src,
+ const ptrdiff_t src_stride, const int width, const int height,
+ SgrBuffer* const buffer, Pixel* dst, const ptrdiff_t dst_stride) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t s = kSgrScaleParameter[sgr_proj_index][1]; // s < 2^12.
+ SgrIntermediateBuffer* intermediate[3];
+ assert(s != 0);
+ intermediate[0] = buffer->intermediate;
+ intermediate[1] = intermediate[0] + kIntermediateStride;
+ intermediate[2] = intermediate[1] + kIntermediateStride;
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2 * src_stride - 2, src_stride,
+ width + 2, s, intermediate[0]);
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 1 * src_stride - 2, src_stride,
+ width + 2, s, intermediate[1]);
+ int y = height;
+ do {
+ BoxFilterPreProcess<bitdepth, Pixel, 3>(src - 2, src_stride, width + 2, s,
+ intermediate[2]);
+ int x = 0;
+ do {
+ const int p = BoxFilterPass2<Pixel>(src[x], intermediate, x);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0, w1);
+ } while (++x < width);
+ src += src_stride;
+ dst += dst_stride;
+ SgrIntermediateBuffer* const intermediate0 = intermediate[0];
+ intermediate[0] = intermediate[1];
+ intermediate[1] = intermediate[2];
+ intermediate[2] = intermediate0;
+ } while (--y != 0);
+}
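
BoxFilterProcessPass2() keeps three rows of pre-processed sums alive and rotates the row pointers instead of copying data; only the newest row is recomputed each iteration. A tiny pointer-rotation sketch of the same pattern (illustrative, not the diff's types):

    #include <cstdio>

    int main() {
      int row_a[4] = {0, 0, 0, 0}, row_b[4] = {1, 1, 1, 1},
          row_c[4] = {2, 2, 2, 2};
      int* rows[3] = {row_a, row_b, row_c};
      for (int y = 0; y < 3; ++y) {
        // rows[2] is the only row that would be recomputed here; rows[0]
        // and rows[1] were produced on the previous two iterations.
        std::printf("y=%d top=%d mid=%d new=%d\n", y, rows[0][0], rows[1][0],
                    rows[2][0]);
        // Rotate: the oldest row becomes the slot for the next new row.
        int* const oldest = rows[0];
        rows[0] = rows[1];
        rows[1] = rows[2];
        rows[2] = oldest;
      }
      return 0;
    }
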
+
template <int bitdepth, typename Pixel>
void LoopRestorationFuncs_C<bitdepth, Pixel>::SelfGuidedFilter(
const void* const source, void* const dest,
const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride,
ptrdiff_t dest_stride, int width, int height,
- RestorationBuffer* const buffer) {
- const int w0 = restoration_info.sgr_proj_info.multiplier[0];
- const int w1 = restoration_info.sgr_proj_info.multiplier[1];
- const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ RestorationBuffer* const /*buffer*/) {
const int index = restoration_info.sgr_proj_info.index;
- const int radius_pass_0 = kSgrProjParams[index][0];
- const int radius_pass_1 = kSgrProjParams[index][2];
- const ptrdiff_t array_stride = buffer->box_filter_process_output_stride;
- const int* box_filter_process_output[2] = {
- buffer->box_filter_process_output[0],
- buffer->box_filter_process_output[1]};
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
const auto* src = static_cast<const Pixel*>(source);
auto* dst = static_cast<Pixel*>(dest);
- source_stride /= sizeof(Pixel);
- dest_stride /= sizeof(Pixel);
- LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
- restoration_info, src, source_stride, width, height, buffer);
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- const int u = src[x] << kSgrProjRestoreBits;
- int v = w1 * u;
- if (radius_pass_0 != 0) {
- v += w0 * box_filter_process_output[0][x];
- } else {
- v += w0 * u;
- }
- if (radius_pass_1 != 0) {
- v += w2 * box_filter_process_output[1][x];
- } else {
- v += w2 * u;
- }
- // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
- // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
- // Then, range of s is bitdepth + 2. This is a rough estimation, taking
- // the maximum value of each element.
- const int s = RightShiftWithRounding(
- v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- dst[x] = static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
- }
- src += source_stride;
- dst += dest_stride;
- box_filter_process_output[0] += array_stride;
- box_filter_process_output[1] += array_stride;
+ SgrBuffer buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass1(
+ restoration_info, src, source_stride, width, height, &buffer, dst,
+ dest_stride);
+ } else if (radius_pass_0 == 0) {
+ LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcessPass2(
+ restoration_info, src, source_stride, width, height, &buffer, dst,
+ dest_stride);
+ } else {
+ LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
+ restoration_info, src, source_stride, width, height, &buffer, dst,
+ dest_stride);
}
}
@@ -739,7 +772,7 @@ void LoopRestorationInit_C() {
// available.
static_cast<void>(CountZeroCoefficients);
static_cast<void>(PopulateWienerCoefficients);
- static_cast<void>(WienerVertical);
+ static_cast<void>(Sum565);
}
} // namespace dsp
diff --git a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h
index 663639c682f..d5511eab24f 100644
--- a/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h
+++ b/chromium/third_party/libgav1/src/src/dsp/loop_restoration.h
@@ -38,6 +38,19 @@
namespace libgav1 {
namespace dsp {
+enum {
+ // Precision of a division table (mtable)
+ kSgrProjScaleBits = 20,
+ kSgrProjReciprocalBits = 12,
+ // Core self-guided restoration precision bits.
+ kSgrProjSgrBits = 8,
+ // Precision bits of generated values higher than source before projection.
+ kSgrProjRestoreBits = 4
+}; // anonymous enum
+
+extern const int kXByXPlus1[256];
+extern const uint8_t kSgrMa2Lookup[256];
+
// Initializes Dsp::loop_restorations. This function is not thread-safe.
void LoopRestorationInit_C();
diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc
index 59cfeb4db72..b51ec8f7270 100644
--- a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.cc
@@ -22,6 +22,7 @@
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
#include "src/utils/types.h"
namespace libgav1 {
@@ -36,12 +37,11 @@ namespace {
!defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel))
// 7.9.2.
-void MotionFieldProjectionKernel_C(
- const ReferenceFrameType* source_reference_type, const MotionVector* mv,
- const uint8_t order_hint[kNumReferenceFrameTypes],
- unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits,
- int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end,
- int x8_start, int x8_end, TemporalMotionField* motion_field) {
+void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info,
+ int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end,
+ int x8_start, int x8_end,
+ TemporalMotionField* motion_field) {
const ptrdiff_t stride = motion_field->mv.columns();
// The column range has to be offset by kProjectionMvMaxHorizontalOffset since
// coordinates in that range could end up being position_x8 because of
@@ -50,37 +50,31 @@ void MotionFieldProjectionKernel_C(
std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
const int adjusted_x8_end = std::min(
x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
MotionVector* dst_mv = motion_field->mv[y8_start];
- int reference_offsets[kNumReferenceFrameTypes];
- bool skip_reference[kNumReferenceFrameTypes];
assert(stride == motion_field->reference_offset.columns());
assert((y8_start & 7) == 0);
- // Initialize skip_reference[kReferenceFrameIntra] to simplify branch
- // conditions in projection.
- skip_reference[kReferenceFrameIntra] = true;
- for (int reference_type = kReferenceFrameLast;
- reference_type <= kNumInterReferenceFrameTypes; ++reference_type) {
- const int reference_offset =
- GetRelativeDistance(current_frame_order_hint,
- order_hint[reference_type], order_hint_shift_bits);
- skip_reference[reference_type] =
- reference_offset > kMaxFrameDistance || reference_offset <= 0;
- reference_offsets[reference_type] = reference_offset;
- }
-
int y8 = y8_start;
do {
const int y8_floor = (y8 & ~7) - y8;
const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);
int x8 = adjusted_x8_start;
do {
- if (skip_reference[source_reference_type[x8]]) continue;
- const int reference_offset = reference_offsets[source_reference_type[x8]];
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
MotionVector projection_mv;
// reference_to_current_with_sign could be 0.
- GetMvProjection(mv[x8], reference_to_current_with_sign, reference_offset,
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
&projection_mv);
// Do not update the motion vector if the block position is not valid or
// if position_x8 is outside the current range of x8_start and x8_end.
@@ -97,9 +91,9 @@ void MotionFieldProjectionKernel_C(
if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
dst_mv[position_y8 * stride + position_x8] = mv[x8];
dst_reference_offset[position_y8 * stride + position_x8] =
- reference_offset;
+ reference_offsets[source_reference_type];
} while (++x8 < adjusted_x8_end);
- source_reference_type += stride;
+ source_reference_types += stride;
mv += stride;
dst_reference_offset += stride;
dst_mv += stride;
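
The rewritten kernel no longer derives the reference offset and the projection divisor per block: ReferenceInfo now supplies them precomputed per reference type (skip_references, relative_distance_to, projection_divisions). The underlying pattern, sketched below with hypothetical names, is to hoist a per-iteration computation into a small table indexed by a low-cardinality key:

    #include <array>
    #include <cstdint>
    #include <vector>

    // Hypothetical: 8 reference frame types, keyed by a small integer.
    constexpr int kNumTypes = 8;

    struct PrecomputedPerType {
      std::array<bool, kNumTypes> skip{};
      std::array<int16_t, kNumTypes> division{};
    };

    // Instead of re-deriving skip/division for every block, fill the tables
    // once per frame and index them with the block's type inside the loop.
    int64_t ProjectAll(const std::vector<int>& block_types,
                       const std::vector<int32_t>& block_values,
                       const PrecomputedPerType& tables) {
      int64_t acc = 0;
      for (size_t i = 0; i < block_types.size(); ++i) {
        const int type = block_types[i];
        if (tables.skip[type]) continue;
        acc += static_cast<int64_t>(block_values[i]) * tables.division[type];
      }
      return acc;
    }
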
diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h
index 5b18be5a3ac..36de459d8f3 100644
--- a/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h
+++ b/chromium/third_party/libgav1/src/src/dsp/motion_field_projection.h
@@ -24,6 +24,14 @@
// ARM:
#include "src/dsp/arm/motion_field_projection_neon.h"
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_field_projection_sse4.h"
+// clang-format on
// IWYU pragma: end_exports
diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc
index 33ecb2b1818..94023027fd9 100644
--- a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.cc
@@ -47,9 +47,10 @@ void MvProjectionCompoundLowPrecision_C(
for (int i = 0; i < 2; ++i) {
// |offsets| non-zero check usually equals true and could be ignored.
if (offsets[i] != 0) {
- GetMvProjection(temporal_mvs[index], offsets[i],
- temporal_reference_offsets[index],
- &candidate_mvs[index].mv[i]);
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
for (auto& mv : candidate_mvs[index].mv[i].mv) {
// The next line is equivalent to:
// if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
@@ -73,9 +74,10 @@ void MvProjectionCompoundForceInteger_C(
for (int i = 0; i < 2; ++i) {
// |offsets| non-zero check usually equals true and could be ignored.
if (offsets[i] != 0) {
- GetMvProjection(temporal_mvs[index], offsets[i],
- temporal_reference_offsets[index],
- &candidate_mvs[index].mv[i]);
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
for (auto& mv : candidate_mvs[index].mv[i].mv) {
// The next line is equivalent to:
// const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
@@ -101,9 +103,10 @@ void MvProjectionCompoundHighPrecision_C(
for (int i = 0; i < 2; ++i) {
// |offsets| non-zero check usually equals true and could be ignored.
if (offsets[i] != 0) {
- GetMvProjection(temporal_mvs[index], offsets[i],
- temporal_reference_offsets[index],
- &candidate_mvs[index].mv[i]);
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
}
}
} while (++index < count);
@@ -115,8 +118,10 @@ void MvProjectionSingleLowPrecision_C(
const int count, MotionVector* const candidate_mvs) {
int index = 0;
do {
- GetMvProjection(temporal_mvs[index], reference_offset,
- temporal_reference_offsets[index], &candidate_mvs[index]);
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
for (auto& mv : candidate_mvs[index].mv) {
// The next line is equivalent to:
// if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
@@ -131,8 +136,10 @@ void MvProjectionSingleForceInteger_C(
const int count, MotionVector* const candidate_mvs) {
int index = 0;
do {
- GetMvProjection(temporal_mvs[index], reference_offset,
- temporal_reference_offsets[index], &candidate_mvs[index]);
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
for (auto& mv : candidate_mvs[index].mv) {
// The next line is equivalent to:
// const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
@@ -149,8 +156,10 @@ void MvProjectionSingleHighPrecision_C(
const int count, MotionVector* const candidate_mvs) {
int index = 0;
do {
- GetMvProjection(temporal_mvs[index], reference_offset,
- temporal_reference_offsets[index], &candidate_mvs[index]);
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
} while (++index < count);
}
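
The "equivalent to" comments in the low-precision variants above describe rounding an odd motion-vector component toward zero. One branchless form with the same effect for 16-bit components can be verified exhaustively; it is not necessarily the exact expression on the elided lines, and it assumes arithmetic right shift of negative values, which holds on all common targets:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int v = -32768; v <= 32767; ++v) {
        // Reference form from the comment:
        // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
        int expected = v;
        if ((expected & 1) != 0) expected += (expected > 0) ? -1 : 1;
        // Branchless candidate: subtract the sign bit, then clear bit 0.
        const int16_t mv = static_cast<int16_t>(v);
        const int branchless = (mv - (mv >> 15)) & ~1;
        assert(branchless == expected);
      }
      return 0;
    }
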
diff --git a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h
index 7ab99a3f2f9..ae16726a961 100644
--- a/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h
+++ b/chromium/third_party/libgav1/src/src/dsp/motion_vector_search.h
@@ -25,6 +25,15 @@
// ARM:
#include "src/dsp/arm/motion_vector_search_neon.h"
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_vector_search_sse4.h"
+// clang-format on
+
// IWYU pragma: end_exports
namespace libgav1 {
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc
index eed99e5a9c6..fd2c54af4f2 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/cdef_sse4.cc
@@ -38,16 +38,7 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// CdefDirection:
-// Mirror values and pad to 16 elements.
-alignas(16) constexpr uint32_t kDivisionTable[] = {840, 420, 280, 210, 168, 140,
- 120, 105, 120, 140, 168, 210,
- 280, 420, 840, 0};
-
-// Used when calculating odd |cost[x]| values to mask off unwanted elements.
-// Holds elements 1 3 5 X 5 3 1 X
-alignas(16) constexpr uint32_t kDivisionTableOdd[] = {420, 210, 140, 0,
- 140, 210, 420, 0};
+#include "src/dsp/cdef.inc"
// Used to calculate |partial[0][i + j]| and |partial[4][7 + i - j]|. The input
// is |src[j]| and it is being added to |partial[]| based on the above indices.
@@ -160,10 +151,10 @@ inline __m128i Square_S32(__m128i a) { return _mm_mullo_epi32(a, a); }
// |cost[0]| and |cost[4]| square the input and sum with the corresponding
// element from the other end of the vector:
-// |kDivisionTable[]| element:
+// |kCdefDivisionTable[]| element:
// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
-// kDivisionTable[i + 1];
-// cost[0] += Square(partial[0][7]) * kDivisionTable[8];
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
// Because everything is being summed into a single value the distributive
// property allows us to mirror the division table and accumulate once.
inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
@@ -185,16 +176,16 @@ inline uint32_t CostOdd(const __m128i a, const __m128i b,
const __m128i a_hi_square =
Square_S32(_mm_cvtepi16_epi32(_mm_srli_si128(a, 8)));
// Swap element 0 and element 2. This pairs partial[i][10 - j] with
- // kDivisionTable[2*j+1].
+ // kCdefDivisionTable[2*j+1].
const __m128i b_lo_square =
_mm_shuffle_epi32(Square_S32(_mm_cvtepi16_epi32(b)), 0x06);
// First terms are indices 3-7.
__m128i c = _mm_srli_si128(a_lo_square, 12);
c = _mm_add_epi32(c, a_hi_square);
- c = _mm_mullo_epi32(c, _mm_set1_epi32(kDivisionTable[7]));
+ c = _mm_mullo_epi32(c, _mm_set1_epi32(kCdefDivisionTable[7]));
// cost[i] += (Square(base_partial[i][j]) + Square(base_partial[i][10 - j])) *
- // kDivisionTable[2 * j + 1];
+ // kCdefDivisionTable[2 * j + 1];
const __m128i second_cost = _mm_add_epi32(a_lo_square, b_lo_square);
c = _mm_add_epi32(c, _mm_mullo_epi32(second_cost, division_table));
return SumVector_S32(c);
@@ -241,18 +232,18 @@ void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
const __m128i signed_offset = _mm_set1_epi16(128 * 8);
partial_lo[2] = _mm_sub_epi16(partial_lo[2], signed_offset);
- cost[2] = kDivisionTable[7] * SquareSum_S16(partial_lo[2]);
- cost[6] = kDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+ cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+ cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
- const __m128i division_table[4] = {LoadUnaligned16(kDivisionTable),
- LoadUnaligned16(kDivisionTable + 4),
- LoadUnaligned16(kDivisionTable + 8),
- LoadUnaligned16(kDivisionTable + 12)};
+ const __m128i division_table[4] = {LoadUnaligned16(kCdefDivisionTable),
+ LoadUnaligned16(kCdefDivisionTable + 4),
+ LoadUnaligned16(kCdefDivisionTable + 8),
+ LoadUnaligned16(kCdefDivisionTable + 12)};
cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
- const __m128i division_table_odd = LoadAligned16(kDivisionTableOdd);
+ const __m128i division_table_odd = LoadAligned16(kCdefDivisionTableOdd);
cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
@@ -315,24 +306,6 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
src + y_1 * stride + stride + x_1);
}
-// Load 4 vectors based on the given |direction|. Use when |block_width| == 2 to
-// do 2 rows at a time.
-void LoadDirection2(const uint16_t* const src, const ptrdiff_t stride,
- __m128i* output, const int direction) {
- const int y_0 = kCdefDirections[direction][0][0];
- const int x_0 = kCdefDirections[direction][0][1];
- const int y_1 = kCdefDirections[direction][1][0];
- const int x_1 = kCdefDirections[direction][1][1];
- output[0] =
- Load4x2(src - y_0 * stride - x_0, src - y_0 * stride - x_0 + stride);
- output[1] =
- Load4x2(src + y_0 * stride + x_0, src - y_0 * stride - x_0 + stride);
- output[2] =
- Load4x2(src - y_1 * stride - x_1, src - y_0 * stride - x_0 + stride);
- output[3] =
- Load4x2(src + y_1 * stride + x_1, src - y_0 * stride - x_0 + stride);
-}
-
inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
const __m128i& damping, const __m128i& threshold) {
const __m128i diff = _mm_sub_epi16(pixel, reference);
@@ -340,6 +313,11 @@ inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
// sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
// 0, std::abs(diff))
const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so subtracting with saturation returns 0 when
+  // pixel == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
const __m128i thresh_minus_shifted_diff =
_mm_subs_epu16(threshold, shifted_diff);
const __m128i clamp_abs_diff =
@@ -349,34 +327,35 @@ inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
}
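
For reference, the scalar operation that Constrain() vectorizes is the one stated in its comment; the SSE4 version additionally relies on the unsigned saturating subtract to neutralize kCdefLargeValue padding, per the new comment above. A plain scalar transcription (a sketch, not part of the library):

    #include <algorithm>
    #include <cstdlib>

    // Scalar form of: sign(diff) * Clip3(threshold - (|diff| >> damping),
    //                                    0, |diff|)
    int ConstrainScalar(int pixel, int reference, int damping, int threshold) {
      const int diff = pixel - reference;
      const int abs_diff = std::abs(diff);
      const int clamped =
          std::min(std::max(threshold - (abs_diff >> damping), 0), abs_diff);
      return (diff < 0) ? -clamped : clamped;
    }
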
inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
- const __m128i& mask, const __m128i& tap,
- const __m128i& damping,
+ const __m128i& tap, const __m128i& damping,
const __m128i& threshold) {
const __m128i constrained = Constrain(val, pixel, damping, threshold);
- return _mm_mullo_epi16(_mm_and_si128(constrained, mask), tap);
+ return _mm_mullo_epi16(constrained, tap);
}
-template <int width>
+template <int width, bool enable_primary = true, bool enable_secondary = true>
void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
const int direction, const int primary_strength,
const int secondary_strength, const int damping, uint8_t* dst,
const ptrdiff_t dst_stride) {
- static_assert(width == 8 || width == 4 || width == 2, "Invalid CDEF width.");
-
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
__m128i primary_damping_shift, secondary_damping_shift;
+
// FloorLog2() requires input to be > 0.
- if (primary_strength == 0) {
- primary_damping_shift = _mm_setzero_si128();
- } else {
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
primary_damping_shift =
_mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
}
-
- if (secondary_strength == 0) {
- secondary_damping_shift = _mm_setzero_si128();
- } else {
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
secondary_damping_shift =
- _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(secondary_strength)));
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
}
const __m128i primary_tap_0 =
@@ -385,8 +364,6 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
_mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
- const __m128i cdef_large_value =
- _mm_set1_epi16(static_cast<int16_t>(kCdefLargeValue));
const __m128i cdef_large_value_mask =
_mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
@@ -397,126 +374,113 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
__m128i pixel;
if (width == 8) {
pixel = LoadUnaligned16(src);
- } else if (width == 4) {
- pixel = LoadHi8(LoadLo8(src), src + src_stride);
- } else {
- pixel = Load4x2(src, src + src_stride);
- }
-
- // Primary |direction|.
- __m128i primary_val[4];
- if (width == 8) {
- LoadDirection(src, src_stride, primary_val, direction);
- } else if (width == 4) {
- LoadDirection4(src, src_stride, primary_val, direction);
} else {
- LoadDirection2(src, src_stride, primary_val, direction);
+ pixel = LoadHi8(LoadLo8(src), src + src_stride);
}
__m128i min = pixel;
- min = _mm_min_epu16(min, primary_val[0]);
- min = _mm_min_epu16(min, primary_val[1]);
- min = _mm_min_epu16(min, primary_val[2]);
- min = _mm_min_epu16(min, primary_val[3]);
-
__m128i max = pixel;
- max = _mm_max_epu16(max,
- _mm_and_si128(primary_val[0], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(primary_val[1], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(primary_val[2], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(primary_val[3], cdef_large_value_mask));
- __m128i mask = _mm_cmplt_epi16(primary_val[0], cdef_large_value);
- __m128i sum =
- ApplyConstrainAndTap(pixel, primary_val[0], mask, primary_tap_0,
- primary_damping_shift, primary_threshold);
- mask = _mm_cmplt_epi16(primary_val[1], cdef_large_value);
- sum = _mm_add_epi16(
- sum, ApplyConstrainAndTap(pixel, primary_val[1], mask, primary_tap_0,
- primary_damping_shift, primary_threshold));
- mask = _mm_cmplt_epi16(primary_val[2], cdef_large_value);
- sum = _mm_add_epi16(
- sum, ApplyConstrainAndTap(pixel, primary_val[2], mask, primary_tap_1,
- primary_damping_shift, primary_threshold));
- mask = _mm_cmplt_epi16(primary_val[3], cdef_large_value);
- sum = _mm_add_epi16(
- sum, ApplyConstrainAndTap(pixel, primary_val[3], mask, primary_tap_1,
- primary_damping_shift, primary_threshold));
-
- // Secondary |direction| values (+/- 2). Clamp |direction|.
- __m128i secondary_val[8];
- if (width == 8) {
- LoadDirection(src, src_stride, secondary_val, (direction + 2) & 0x7);
- LoadDirection(src, src_stride, secondary_val + 4, (direction - 2) & 0x7);
- } else if (width == 4) {
- LoadDirection4(src, src_stride, secondary_val, (direction + 2) & 0x7);
- LoadDirection4(src, src_stride, secondary_val + 4, (direction - 2) & 0x7);
+ __m128i sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ min = _mm_min_epu16(min, primary_val[0]);
+ min = _mm_min_epu16(min, primary_val[1]);
+ min = _mm_min_epu16(min, primary_val[2]);
+ min = _mm_min_epu16(min, primary_val[3]);
+
+      // The source is 16 bits, but only the lower 8 bits matter: the upper
+      // 8 bits contain the "large" flag. After the final primary max has
+      // been calculated, zero out the upper 8 bits and use the result to
+      // find the "16 bit" max.
+ const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+ const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+ const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+ max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+
+ sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+ primary_damping_shift, primary_threshold));
} else {
- LoadDirection2(src, src_stride, secondary_val, (direction + 2) & 0x7);
- LoadDirection2(src, src_stride, secondary_val + 4, (direction - 2) & 0x7);
+ sum = _mm_setzero_si128();
}
- min = _mm_min_epu16(min, secondary_val[0]);
- min = _mm_min_epu16(min, secondary_val[1]);
- min = _mm_min_epu16(min, secondary_val[2]);
- min = _mm_min_epu16(min, secondary_val[3]);
- min = _mm_min_epu16(min, secondary_val[4]);
- min = _mm_min_epu16(min, secondary_val[5]);
- min = _mm_min_epu16(min, secondary_val[6]);
- min = _mm_min_epu16(min, secondary_val[7]);
-
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[0], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[1], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[2], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[3], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[4], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[5], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[6], cdef_large_value_mask));
- max = _mm_max_epu16(max,
- _mm_and_si128(secondary_val[7], cdef_large_value_mask));
-
- mask = _mm_cmplt_epi16(secondary_val[0], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[0], mask, secondary_tap_0,
- secondary_damping_shift, secondary_threshold));
- mask = _mm_cmplt_epi16(secondary_val[1], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[1], mask, secondary_tap_0,
- secondary_damping_shift, secondary_threshold));
- mask = _mm_cmplt_epi16(secondary_val[2], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[2], mask, secondary_tap_1,
- secondary_damping_shift, secondary_threshold));
- mask = _mm_cmplt_epi16(secondary_val[3], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[3], mask, secondary_tap_1,
- secondary_damping_shift, secondary_threshold));
- mask = _mm_cmplt_epi16(secondary_val[4], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[4], mask, secondary_tap_0,
- secondary_damping_shift, secondary_threshold));
- mask = _mm_cmplt_epi16(secondary_val[5], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[5], mask, secondary_tap_0,
- secondary_damping_shift, secondary_threshold));
- mask = _mm_cmplt_epi16(secondary_val[6], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[6], mask, secondary_tap_1,
- secondary_damping_shift, secondary_threshold));
- mask = _mm_cmplt_epi16(secondary_val[7], cdef_large_value);
- sum = _mm_add_epi16(sum, ApplyConstrainAndTap(
- pixel, secondary_val[7], mask, secondary_tap_1,
- secondary_damping_shift, secondary_threshold));
-
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ min = _mm_min_epu16(min, secondary_val[0]);
+ min = _mm_min_epu16(min, secondary_val[1]);
+ min = _mm_min_epu16(min, secondary_val[2]);
+ min = _mm_min_epu16(min, secondary_val[3]);
+ min = _mm_min_epu16(min, secondary_val[4]);
+ min = _mm_min_epu16(min, secondary_val[5]);
+ min = _mm_min_epu16(min, secondary_val[6]);
+ min = _mm_min_epu16(min, secondary_val[7]);
+
+ const __m128i max_s01 = _mm_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m128i max_s23 = _mm_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m128i max_s45 = _mm_max_epu8(secondary_val[4], secondary_val[5]);
+ const __m128i max_s67 = _mm_max_epu8(secondary_val[6], secondary_val[7]);
+ const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
+ _mm_max_epu8(max_s45, max_s67));
+ max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
+
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
// Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
// 8 + sum
@@ -536,20 +500,13 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
StoreLo8(dst, result);
dst += dst_stride;
++y;
- } else if (width == 4) {
+ } else {
src += 2 * src_stride;
Store4(dst, result);
dst += dst_stride;
Store4(dst, _mm_srli_si128(result, 4));
dst += dst_stride;
y += 2;
- } else {
- src += 2 * src_stride;
- Store2(dst, result);
- dst += dst_stride;
- Store2(dst, _mm_srli_si128(result, 2));
- dst += dst_stride;
- y += 2;
}
} while (y < height);
}
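
The final store above follows the rounding comment Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max). A scalar sketch of that step (illustrative only, not part of the patch):

inline int CdefRoundAndClip(int pixel, int sum, int min_val, int max_val) {
  // (8 + sum - (sum < 0)) >> 4 rounds sum / 16 to nearest, ties away from 0.
  const int offset = (8 + sum - static_cast<int>(sum < 0)) >> 4;
  const int value = pixel + offset;
  return (value < min_val) ? min_val : (value > max_val) ? max_val : value;
}
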
@@ -558,29 +515,46 @@ void DoCdef(const uint16_t* src, const ptrdiff_t src_stride, const int height,
// inside the frame. However, it requires the source input to be padded with a
// constant large value if at the boundary. The input must be uint16_t.
void CdefFilter_SSE4_1(const void* const source, const ptrdiff_t source_stride,
- const int rows4x4, const int columns4x4,
- const int curr_x, const int curr_y,
- const int subsampling_x, const int subsampling_y,
+ const int block_width, const int block_height,
const int primary_strength, const int secondary_strength,
const int damping, const int direction, void* const dest,
const ptrdiff_t dest_stride) {
- const int plane_width = MultiplyBy4(columns4x4) >> subsampling_x;
- const int plane_height = MultiplyBy4(rows4x4) >> subsampling_y;
- const int block_width = std::min(8 >> subsampling_x, plane_width - curr_x);
- const int block_height = std::min(8 >> subsampling_y, plane_height - curr_y);
const auto* src = static_cast<const uint16_t*>(source);
auto* dst = static_cast<uint8_t*>(dest);
- if (block_width == 8) {
- DoCdef<8>(src, source_stride, block_height, direction, primary_strength,
- secondary_strength, damping, dst, dest_stride);
- } else if (block_width == 4) {
- DoCdef<4>(src, source_stride, block_height, direction, primary_strength,
- secondary_strength, damping, dst, dest_stride);
+ if (secondary_strength > 0) {
+ if (primary_strength > 0) {
+ if (block_width == 8) {
+ DoCdef<8>(src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ } else {
+ assert(block_width == 4);
+ DoCdef<4>(src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ }
+ } else {
+ if (block_width == 8) {
+ DoCdef<8, /*enable_primary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ } else {
+ assert(block_width == 4);
+ DoCdef<4, /*enable_primary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ }
+ }
} else {
- assert(block_width == 2);
- DoCdef<2>(src, source_stride, block_height, direction, primary_strength,
- secondary_strength, damping, dst, dest_stride);
+ if (block_width == 8) {
+ DoCdef<8, /*enable_primary=*/true, /*enable_secondary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ } else {
+ assert(block_width == 4);
+ DoCdef<4, /*enable_primary=*/true, /*enable_secondary=*/false>(
+ src, source_stride, block_height, direction, primary_strength,
+ secondary_strength, damping, dst, dest_stride);
+ }
}
}
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h
index 8b03db69f7a..24c801fd863 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/common_sse4.h
@@ -17,6 +17,7 @@
#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#include "src/utils/compiler_attributes.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_SSE4_1
@@ -91,6 +92,14 @@ inline __m128i Load2x2(const void* src1, const void* src2) {
return _mm_cvtsi32_si128(val1 | (val2 << 16));
}
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return _mm_insert_epi16(val, temp, lane);
+}
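
FilterVertical2xH in convolve_sse4.cc below builds its row registers with this helper. A sketch of packing four consecutive 2-pixel rows (illustrative only; the function name is an assumption):

inline __m128i PackFour2PixelRows(const uint8_t* src, const ptrdiff_t stride) {
  __m128i v = _mm_setzero_si128();
  v = Load2<0>(src + 0 * stride, v);  // bytes: 00 01
  v = Load2<1>(src + 1 * stride, v);  //        00 01 10 11
  v = Load2<2>(src + 2 * stride, v);  //        00 01 10 11 20 21
  v = Load2<3>(src + 3 * stride, v);  //        00 01 10 11 20 21 30 31
  return v;
}
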
+
inline __m128i Load4(const void* src) {
// With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
// intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
@@ -136,6 +145,41 @@ inline __m128i LoadAligned16(const void* a) {
}
//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+ const int over_read_in_bytes) {
+ __m128i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ __m128i mask = _mm_set1_epi8(-1);
+ for (int i = 0; i < over_read_in_bytes; ++i) {
+ mask = _mm_srli_si128(mask, 1);
+ }
+ dst = _mm_and_si128(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+ const int over_read_in_bytes) {
+ return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+ const int over_read_in_bytes) {
+ return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+ const int over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
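
A usage sketch (illustrative only; names are assumptions): when the last 16-byte chunk of a row extends past the initialized data, passing the number of over-read bytes zeroes those trailing lanes under MSan while leaving release builds untouched.

inline __m128i LoadRowTail(const uint8_t* chunk, int valid_bytes) {
  return LoadUnaligned16Msan(chunk, /*over_read_in_bytes=*/16 - valid_bytes);
}
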
+
+//------------------------------------------------------------------------------
// Store functions.
inline void Store2(void* dst, const __m128i x) {
@@ -156,6 +200,10 @@ inline void StoreHi8(void* a, const __m128i v) {
_mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
}
+inline void StoreAligned16(void* a, const __m128i v) {
+ _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
inline void StoreUnaligned16(void* a, const __m128i v) {
_mm_storeu_si128(static_cast<__m128i*>(a), v);
}
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc
index 40ce568a491..a0ed3bea758 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.cc
@@ -13,6 +13,7 @@
// limitations under the License.
#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_SSE4_1
@@ -33,8 +34,40 @@ namespace dsp {
namespace low_bitdepth {
namespace {
+// TODO(slavarnway): Move to common neon/sse4 file.
+int GetNumTapsInFilter(const int filter_index) {
+ if (filter_index < 2) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
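
In short, the mapping implemented above (summary for reference, not patch content):

// filter_index 0, 1 -> 6 taps (kInterpolationFilterEightTap / ...Smooth)
// filter_index 2    -> 8 taps (kInterpolationFilterEightTapSharp)
// filter_index 3    -> 2 taps (kInterpolationFilterBilinear)
// filter_index 4, 5 -> 4 taps (replacements used when width/height <= 4)
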
+
+constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
constexpr int kSubPixelMask = (1 << kSubPixelBits) - 1;
constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
@@ -177,6 +210,15 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return _mm_packus_epi16(sum, sum);
}
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
template <int num_taps, int step, int filter_index, bool is_2d = false,
bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
@@ -195,7 +237,11 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
if (is_2d || is_compound) {
const __m128i v_sum =
HorizontalTaps8To16<filter_index>(&src[x], v_tap);
- StoreUnaligned16(&dest16[x], v_sum);
+ if (is_2d) {
+ StoreAligned16(&dest16[x], v_sum);
+ } else {
+ StoreUnaligned16(&dest16[x], v_sum);
+ }
} else {
const __m128i result =
SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
@@ -236,7 +282,12 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
int y = 0;
do {
if (is_2d) {
- // TODO(slavarnway): Add 2d support
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
} else {
const __m128i sum =
SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
@@ -254,13 +305,33 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
// generates context for the vertical pass.
if (is_2d) {
assert(height % 2 == 1);
- // TODO(slavarnway): Add 2d support
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
}
}
}
}
-template <int num_taps>
+template <int num_taps, bool is_2d_vertical = false>
LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
__m128i* v_tap) {
if (num_taps == 8) {
@@ -268,30 +339,295 @@ LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
- v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
} else if (num_taps == 6) {
const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
} else if (num_taps == 4) {
v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
} else { // num_taps == 2
const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
}
}
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+ const __m128i* const taps) {
+ __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m128i madd_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m128i madd_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m128i srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src_x);
+ src_x += src_stride;
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16 + x + y * dst_stride, sum);
+ } else {
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+ if (num_taps == 8) {
+ srcs[6] = LoadAligned16(src);
+ src += 8;
+ srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[num_taps] = LoadAligned16(src);
+ src += 8;
+ srcs[num_taps - 1] = _mm_unpacklo_epi64(
+ _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16, sum);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results = _mm_packus_epi16(sum, sum);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y += 2;
+ } while (y < height);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ if (num_taps == 8) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ } else if (num_taps == 4) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ } else if (num_taps == 6) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ } else if (num_taps == 8) {
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+ srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+ }
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const __m128i results = _mm_packus_epi16(sum, sum);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+ // Therefore we don't need to check this condition when |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
+
template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
@@ -330,6 +666,765 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
}
+void Convolve2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height, subpixel_x,
+ horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
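
The intermediate buffer above is sized so the vertical taps always have full context: the horizontal pass starts (vertical_taps / 2 - 1) rows above the block (and kHorizontalOffset columns to the left) and produces height + vertical_taps - 1 rows. A minimal sketch of that sizing (illustrative only, not part of the patch):

constexpr int IntermediateHeight(int height, int vertical_taps) {
  return height + vertical_taps - 1;  // e.g. 16 rows with 8 taps -> 23 rows.
}
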
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m128i Compound1DShift(const __m128i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+ __m128i v_src[4];
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16 + x + y * dst_stride, results);
+ } else {
+ const __m128i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ int y = 0;
+ do {
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ srcs[6] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ int y = 0;
+ do {
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ srcs[8] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ int y = 0;
+ do {
+ // 70 71 72 73
+ const __m128i d = Load4(src);
+ // 60 61 62 63 70 71 72 73
+ srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+ src += src_stride;
+ // 80 81 82 83
+ srcs[8] = Load4(src);
+ src += src_stride;
+ // 70 71 72 73 80 81 82 83
+ srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+ // This uses srcs[0]..srcs[1].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ // This uses srcs[0]..srcs[3].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+ int y = 0;
+ do {
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+ // This uses srcs[0]..srcs[5].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ int y = 0;
+ do {
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91 a0 a1
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+
+ // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+ // 60 61 70 71 80 81 90 91
+ srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+ // 70 71 80 81 90 91 a0 a1
+ srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+ // This uses srcs[0]..srcs[7].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
+void ConvolveVertical_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*subpixel_x*/, const int subpixel_y,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else {
+ // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
+ // See convolve_neon.cc
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
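
The filter phase used above comes from the sub-pixel position: shifting by 6 (kFilterIndexShift) and masking with kSubPixelMask selects one of the 16 columns in kHalfSubPixelFilters, and the functions assert a non-zero phase. A sketch (illustrative only, not part of the patch):

inline int GetFilterId(int subpixel) {
  return (subpixel >> kFilterIndexShift) & kSubPixelMask;  // one of 16 phases
}
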
+
void ConvolveCompoundCopy_SSE4(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
@@ -388,6 +1483,76 @@ void ConvolveCompoundCopy_SSE4(
}
}
+void ConvolveCompoundVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*subpixel_x*/, const int subpixel_y, const int width,
+ const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else {
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ }
+}
+
void ConvolveHorizontal_SSE4_1(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
@@ -418,13 +1583,720 @@ void ConvolveCompoundHorizontal_SSE4_1(
filter_index);
}
+void ConvolveCompound2D_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y, const int width,
+ const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ subpixel_x, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
+
+ const ptrdiff_t dest_stride = width;
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
+// Pre-transposed filters.
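+// Each table row below holds one tap of the halved filter for all 16
+// sub-pixel phases, so PrepareHorizontalTaps() can gather a given tap for
+// eight pixels at once with a single _mm_shuffle_epi8() per row.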
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+ // Filter 0
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+ {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+ // Filter 1
+ alignas(16) static constexpr int8_t
+ kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+ // Filter 2
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+ {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+ // Filter 3
+ alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+ // Filter 4
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+ {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+ // Filter 5
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+ switch (filter_index) {
+ case 0:
+ output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+ break;
+ case 1:
+ // The term "mixed" refers to the fact that the outer taps have a mix of
+ // negative and positive values.
+ output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+ break;
+ case 2:
+ output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+ output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+ output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+ break;
+ case 3:
+ output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+ break;
+ case 4:
+ output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+ break;
+ default:
+ assert(filter_index == 5);
+ output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+ output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+ output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+ output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+ break;
+ }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8 separate
+// starting positions plus 7 more to cover taps, all fitting into 16 bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load covers
+// the starting sources for each kernel, while the final load covers the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+ __m128i source[num_taps >> 1]) {
+ const __m128i src_vals = LoadUnaligned16(src);
+ source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+ if (grade_x == 1) {
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+ }
+ if (num_taps > 4) {
+ source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+ }
+ } else {
+ assert(grade_x > 1);
+ assert(num_taps != 4);
+ // grade_x > 1 also means width >= 8 && num_taps != 4
+ const __m128i src_vals_ext = LoadLo8(src + 16);
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+ src_indices);
+ source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+ src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+ src_indices);
+ }
+ }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+ const __m128i* filter_taps,
+ __m128i* out_taps) {
+ const __m128i scale_index_offsets =
+ _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+ const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+ const __m128i filter_indices =
+ _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+ filter_index_mask);
+ // Line up taps for maddubs_epi16.
+ // The unpack is also assumed to be lighter than shift+alignr.
+ for (int k = 0; k < (num_taps >> 1); ++k) {
+ const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+ const __m128i taps1 =
+ _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+ out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+ }
+}
+
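+// Packs the whole-pixel source offset of each output pixel into bytes and
+// interleaves it with offset + 1, forming the shuffle control that
+// PrepareSourceVectors() uses to gather adjacent byte pairs for maddubs.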
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+ const __m128i src_indices16 =
+ _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+ const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+ return _mm_unpacklo_epi8(src_indices,
+ _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
+ int width, int subpixel_x, int step_x,
+ int intermediate_height,
+ int16_t* intermediate) {
+  // Account for the 0-taps that precede the |num_taps| nonzero taps in the
+  // 8-entry filter row.
+ const int kernel_offset = (8 - num_taps) >> 1;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ __m128i filter_taps[num_taps];
+ GetHalfSubPixelFilter<filter_index>(filter_taps);
+ const __m128i index_steps =
+ _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+ _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+ __m128i taps[num_taps >> 1];
+ __m128i source[num_taps >> 1];
+ int p = subpixel_x;
+ // Case when width <= 4 is possible.
+ if (filter_index >= 3) {
+ if (filter_index > 3 || width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+      // Load and line up source values with the taps. With width <= 4 there
+      // is no need to load the extended source.
+ PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+ source);
+
+ StoreLo8(intermediate, RightShiftWithRounding_S16(
+ SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ return;
+ }
+ }
+
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+ // For each x, a lane of src_k[k] contains src_x[k].
+ PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+ // Shift by one less because the taps are halved.
+ StoreAligned16(
+ intermediate_x,
+ RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+ // Avoid overreading the filter due to starting at kernel_offset.
+ // The only danger of overread is in the final filter, which has 4 taps.
+ const __m128i filter =
+ _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
+ output[0] = _mm_shuffle_epi32(filter, 0);
+ if (num_taps > 2) {
+ output[1] = _mm_shuffle_epi32(filter, 0x55);
+ }
+ if (num_taps > 4) {
+ output[2] = _mm_shuffle_epi32(filter, 0xAA);
+ }
+ if (num_taps > 6) {
+ output[3] = _mm_shuffle_epi32(filter, 0xFF);
+ }
+}
+
+// Process eight 16-bit inputs and output eight 16-bit values.
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+ const __m128i* taps) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+ }
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// The low half of each src[k] is the input for one filter (one destination
+// row); the high half is the input for the other filter, which produces the
+// next destination row.
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+ const __m128i* taps_hi) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, and selects the store width used for each
+// destination row (Store2, Store4, or 8-pixel stores).
+template <int num_taps, int width_class, bool is_compound>
+#if LIBGAV1_MSAN
+__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
+#else
+inline void ConvolveVerticalScale(
+#endif
+ const int16_t* src, const int width, const int subpixel_y,
+ const int filter_index, const int step_y, const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ constexpr int kernel_offset = (8 - num_taps) / 2;
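+  // |kernel_offset| skips the zero taps that pad filters shorter than 8 taps
+  // out to the 8-entry rows of kHalfSubPixelFilters.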
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ auto* dest16_y = static_cast<uint16_t*>(dest);
+ auto* dest_y = static_cast<uint8_t*>(dest);
+ __m128i s[num_taps];
+
+ int p = subpixel_y & 1023;
+ int y = height;
+ if (width_class <= 4) {
+ __m128i filter_taps_lo[num_taps >> 1];
+ __m128i filter_taps_hi[num_taps >> 1];
+ do { // y > 0
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadLo8(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter0 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadHi8(s[i], src_y + i * src_stride);
+ }
+ filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter1 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+ s, filter_taps_lo, filter_taps_hi);
+ if (is_compound) {
+ assert(width_class > 2);
+ StoreLo8(dest16_y, sums);
+ dest16_y += dest_stride;
+ StoreHi8(dest16_y, sums);
+ dest16_y += dest_stride;
+ } else {
+ const __m128i result = _mm_packus_epi16(sums, sums);
+ if (width_class == 2) {
+ Store2(dest_y, result);
+ dest_y += dest_stride;
+ Store2(dest_y, _mm_srli_si128(result, 4));
+ } else {
+ Store4(dest_y, result);
+ dest_y += dest_stride;
+ Store4(dest_y, _mm_srli_si128(result, 4));
+ }
+ dest_y += dest_stride;
+ }
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ // |width_class| >= 8
+ __m128i filter_taps[num_taps >> 1];
+ do { // y > 0
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+ int x = 0;
+ do { // x < width
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadUnaligned16(src_y + i * src_stride);
+ }
+
+ const __m128i sums =
+ Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+ if (is_compound) {
+ StoreUnaligned16(dest16_y + x, sums);
+ } else {
+ StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+ }
+ x += 8;
+ src_y += 8;
+ } while (x < width);
+ p += step_y;
+ dest_y += dest_stride;
+ dest16_y += dest_stride;
+ } while (--y != 0);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ // TODO(petersonab): Reduce intermediate block stride to width to make smaller
+ // blocks faster.
+ alignas(16) int16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+ // Derive the maximum value of |step_x| at which all source values fit in one
+ // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
+ // step_x*7 is the final base sub-pixel index for the shuffle mask for filter
+ // inputs in each iteration on large blocks. When step_x is large, we need a
+ // second register and alignr in order to gather all filter inputs.
+ // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
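+  // For example, with the 8-tap filters |kernel_start_ceiling| is 8, so the
+  // single-load (|grade_x| == 1) path is taken only while
+  // step_x <= (8 << 10) / 7 = 1170, i.e. up to roughly 1.14x downscaling.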
+ switch (horiz_filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ break;
+ default:
+ assert(horiz_filter_index == 5);
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+
+ // Vertical filter.
+ intermediate = intermediate_result;
+ switch (vert_filter_index) {
+ case 0:
+ case 1:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<6, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 2:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<8, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 3:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<2, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ default:
+ assert(vert_filter_index == 4 || vert_filter_index == 5);
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<4, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+}
+
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+ dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
}
} // namespace
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h
index 92f35d79426..e449a87436f 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/convolve_sse4.h
@@ -38,6 +38,14 @@ void ConvolveInit_SSE4_1();
#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
#endif
@@ -46,6 +54,22 @@ void ConvolveInit_SSE4_1();
#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
#endif // LIBGAV1_ENABLE_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc
index 78dec96bc69..edb8b1405f8 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/loop_filter_sse4.cc
@@ -1143,7 +1143,7 @@ template <int bitdepth>
struct LoopFilterFuncs_SSE4_1 {
LoopFilterFuncs_SSE4_1() = delete;
- static const int kThreshShift = bitdepth - 8;
+ static constexpr int kThreshShift = bitdepth - 8;
static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
int inner_thresh, int hev_thresh);
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc
index 02b7ed03e1a..7a01ab15aae 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/loop_restoration_sse4.cc
@@ -36,14 +36,6 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// Precision of a division table (mtable)
-constexpr int kSgrProjScaleBits = 20;
-constexpr int kSgrProjReciprocalBits = 12;
-// Core self-guided restoration precision bits.
-constexpr int kSgrProjSgrBits = 8;
-// Precision bits of generated values higher than source before projection.
-constexpr int kSgrProjRestoreBits = 4;
-
// Note: range of wiener filter coefficients.
// Wiener filter coefficients are symmetric, and their sum is 1 (128).
// The range of each coefficient:
@@ -85,12 +77,12 @@ void WienerFilter_SSE4_1(const void* source, void* const dest,
(1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
const auto* src = static_cast<const uint8_t*>(source);
auto* dst = static_cast<uint8_t*>(dest);
- const ptrdiff_t buffer_stride = buffer->wiener_buffer_stride;
- auto* wiener_buffer = buffer->wiener_buffer;
+ const ptrdiff_t buffer_stride = (width + 7) & ~7;
+ auto* wiener_buffer = buffer->wiener_buffer + buffer_stride;
// horizontal filtering.
PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter);
const int center_tap = 3;
- src -= center_tap * source_stride + center_tap;
+ src -= (center_tap - 1) * source_stride + center_tap;
const int horizontal_rounding =
1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
@@ -108,7 +100,7 @@ void WienerFilter_SSE4_1(const void* source, void* const dest,
const __m128i v_offset_shift =
_mm_cvtsi32_si128(7 - kInterRoundBitsHorizontal);
- int y = 0;
+ int y = height + kSubPixelTaps - 4;
do {
int x = 0;
do {
@@ -156,9 +148,16 @@ void WienerFilter_SSE4_1(const void* source, void* const dest,
} while (x < width);
src += source_stride;
wiener_buffer += buffer_stride;
- } while (++y < height + kSubPixelTaps - 2);
-
+ } while (--y != 0);
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer, wiener_buffer - buffer_stride,
+ sizeof(*wiener_buffer) * width);
wiener_buffer = buffer->wiener_buffer;
+ memcpy(wiener_buffer, wiener_buffer + buffer_stride,
+ sizeof(*wiener_buffer) * width);
+
// vertical filtering.
PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter);
@@ -211,521 +210,1380 @@ void WienerFilter_SSE4_1(const void* source, void* const dest,
} while (++y < height);
}
-// Section 7.17.3.
-// a2: range [1, 256].
-// if (z >= 255)
-// a2 = 256;
-// else if (z == 0)
-// a2 = 1;
-// else
-// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
-constexpr int kXByXPlus1[256] = {
- 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
- 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
- 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
- 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
- 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
- 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
- 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 256};
-
-inline __m128i HorizontalAddVerticalSumsRadius1(const uint32_t* vert_sums) {
- // Horizontally add vertical sums to get total box sum.
- const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]);
- const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]);
- const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4);
- const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8);
- const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321);
- const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432);
- return v_s1;
-}
-
-inline __m128i HorizontalAddVerticalSumsRadius2(const uint32_t* vert_sums) {
- // Horizontally add vertical sums to get total box sum.
- const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]);
- const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]);
- const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4);
- const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8);
- const __m128i v_sums_6543 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 12);
- const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321);
- const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432);
- const __m128i v_s2 = _mm_add_epi32(v_s1, v_sums_6543);
- const __m128i v_s3 = _mm_add_epi32(v_s2, v_sums_7654);
- return v_s3;
-}
-
-void BoxFilterPreProcessRadius1_SSE4_1(
- const uint8_t* const src, ptrdiff_t stride, int width, int height,
- uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride,
- uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) {
- assert(s != 0);
- const uint32_t n = 9;
- const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
- const __m128i v_one_over_n =
- _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0);
- const __m128i v_sgrbits =
- _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0);
-
-#if LIBGAV1_MSAN
- // Over-reads occur in the x loop, so set to a known value.
- memset(&vertical_sums[width], 0, 8 * sizeof(vertical_sums[0]));
- memset(&vertical_sum_of_squares[width], 0,
- 8 * sizeof(vertical_sum_of_squares[0]));
-#endif
+//------------------------------------------------------------------------------
+// SGR
- // Calculate intermediate results, including one-pixel border, for example,
- // if unit size is 64x64, we calculate 66x66 pixels.
- int y = -1;
- do {
- const uint8_t* top_left = &src[(y - 1) * stride - 2];
- // Calculate the box vertical sums for each x position.
- int vsx = -2;
- do {
- const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left));
- const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride));
- const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2));
- const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0);
- const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1);
- const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2);
- const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1);
- const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2);
- const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1);
- const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2);
- StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a012);
- StoreUnaligned16(&vertical_sums[vsx], v_b012);
- top_left += 4;
- vsx += 4;
- } while (vsx <= width + 1);
-
- int x = -1;
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
+
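+// The helpers below mirror the NEON intrinsic names (vaddl/vaddw/vmull/vmlal/
+// vrshr/vshll): widening adds, widening multiplies and rounding shifts built
+// from unpack + add/madd/shift sequences.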
+inline __m128i VaddlLo8(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128());
+ return _mm_add_epi16(a0, b0);
+}
+
+inline __m128i VaddlHi8(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpackhi_epi8(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128());
+ return _mm_add_epi16(a0, b0);
+}
+
+inline __m128i VaddlLo16(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ return _mm_add_epi32(a0, b0);
+}
+
+inline __m128i VaddlHi16(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ return _mm_add_epi32(a0, b0);
+}
+
+inline __m128i VaddwLo8(const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128());
+ return _mm_add_epi16(a, b0);
+}
+
+inline __m128i VaddwHi8(const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128());
+ return _mm_add_epi16(a, b0);
+}
+
+inline __m128i VaddwLo16(const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ return _mm_add_epi32(a, b0);
+}
+
+inline __m128i VaddwHi16(const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ return _mm_add_epi32(a, b0);
+}
+
+// Using VgetLane16() can save a sign extension instruction.
+template <int n>
+inline int16_t VgetLane16(const __m128i a) {
+ return _mm_extract_epi16(a, n);
+}
+
+inline __m128i VmullLo8(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpacklo_epi8(b, _mm_setzero_si128());
+ return _mm_mullo_epi16(a0, b0);
+}
+
+inline __m128i VmullHi8(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpackhi_epi8(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpackhi_epi8(b, _mm_setzero_si128());
+ return _mm_mullo_epi16(a0, b0);
+}
+
+inline __m128i VmullNLo8(const __m128i a, const int16_t b) {
+ const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128());
+ return _mm_madd_epi16(a0, _mm_set1_epi32(b));
+}
+
+inline __m128i VmullNHi8(const __m128i a, const int16_t b) {
+ const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128());
+ return _mm_madd_epi16(a0, _mm_set1_epi32(b));
+}
+
+inline __m128i VmullLo16(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpacklo_epi16(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ return _mm_madd_epi16(a0, b0);
+}
+
+inline __m128i VmullHi16(const __m128i a, const __m128i b) {
+ const __m128i a0 = _mm_unpackhi_epi16(a, _mm_setzero_si128());
+ const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ return _mm_madd_epi16(a0, b0);
+}
+
+inline __m128i VmulwLo16(const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ return _mm_madd_epi16(a, b0);
+}
+
+inline __m128i VmulwHi16(const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ return _mm_madd_epi16(a, b0);
+}
+
+inline __m128i VmlalNLo16(const __m128i sum, const __m128i a, const int16_t b) {
+ return _mm_add_epi32(sum, VmullNLo8(a, b));
+}
+
+inline __m128i VmlalNHi16(const __m128i sum, const __m128i a, const int16_t b) {
+ return _mm_add_epi32(sum, VmullNHi8(a, b));
+}
+
+inline __m128i VmlawLo16(const __m128i sum, const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ return _mm_add_epi32(sum, _mm_madd_epi16(a, b0));
+}
+
+inline __m128i VmlawHi16(const __m128i sum, const __m128i a, const __m128i b) {
+ const __m128i b0 = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ return _mm_add_epi32(sum, _mm_madd_epi16(a, b0));
+}
+
+inline __m128i VrshrNS32(const __m128i a, const int b) {
+ const __m128i sum = _mm_add_epi32(a, _mm_set1_epi32(1 << (b - 1)));
+ return _mm_srai_epi32(sum, b);
+}
+
+inline __m128i VrshrN32(const __m128i a, const int b) {
+ const __m128i sum = _mm_add_epi32(a, _mm_set1_epi32(1 << (b - 1)));
+ return _mm_srli_epi32(sum, b);
+}
+
+inline __m128i VshllN8(const __m128i a, const int b) {
+ const __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128());
+ return _mm_slli_epi16(a0, b);
+}
+
+template <int n>
+inline __m128i CalcAxN(const __m128i a) {
+ static_assert(n == 9 || n == 25, "");
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(a, _mm_set1_epi32(n));
+ const __m128i ax9 = _mm_add_epi32(a, _mm_slli_epi32(a, 3));
+ if (n == 9) return ax9;
+ if (n == 25) return _mm_add_epi32(ax9, _mm_slli_epi32(a, 4));
+}
+
+template <int n>
+inline __m128i CalculateSgrMA2(const __m128i sum_sq, const __m128i sum,
+ const uint32_t s) {
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
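+  // |sum| holds the 16-bit box sums zero-extended into 32-bit lanes, so
+  // _mm_madd_epi16(sum, sum) yields sum * sum + 0 * 0 = d * d per lane.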
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ const __m128i axn = CalcAxN<n>(sum_sq);
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+
+ // z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(s));
+ return VrshrN32(pxs, kSgrProjScaleBits);
+}
+
+inline __m128i CalculateIntermediate4(const __m128i sgr_ma2, const __m128i sum,
+ const uint32_t one_over_n) {
+ // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n
+ // 1 << kSgrProjSgrBits = 256
+ // |a2| = [1, 256]
+ // |sgr_ma2| max value = 255
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ const __m128i sgr_ma2q = _mm_unpacklo_epi8(sgr_ma2, _mm_setzero_si128());
+ const __m128i s = _mm_unpackhi_epi16(sgr_ma2q, _mm_setzero_si128());
+ const __m128i m = _mm_madd_epi16(s, sum);
+ const __m128i b2 = _mm_mullo_epi32(m, _mm_set1_epi32(one_over_n));
+ // static_cast<int>(RightShiftWithRounding(b2, kSgrProjReciprocalBits));
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i truncate_u32 = VrshrN32(b2, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(truncate_u32, truncate_u32);
+}
+
+inline __m128i CalculateIntermediate8(const __m128i sgr_ma2, const __m128i sum,
+ const uint32_t one_over_n) {
+ // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n
+ // 1 << kSgrProjSgrBits = 256
+ // |a2| = [1, 256]
+ // |sgr_ma2| max value = 255
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ const __m128i sgr_ma2q = _mm_unpackhi_epi8(sgr_ma2, _mm_setzero_si128());
+ const __m128i m0 = VmullLo16(sgr_ma2q, sum);
+ const __m128i m1 = VmullHi16(sgr_ma2q, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ // static_cast<int>(RightShiftWithRounding(b2, kSgrProjReciprocalBits));
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i b2_lo = VrshrN32(m2, kSgrProjReciprocalBits);
+ const __m128i b2_hi = VrshrN32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b2_lo, b2_hi);
+}
+
+inline __m128i Sum3_16(const __m128i left, const __m128i middle,
+ const __m128i right) {
+ const __m128i sum = _mm_add_epi16(left, middle);
+ return _mm_add_epi16(sum, right);
+}
+
+inline __m128i Sum3_32(const __m128i left, const __m128i middle,
+ const __m128i right) {
+ const __m128i sum = _mm_add_epi32(left, middle);
+ return _mm_add_epi32(sum, right);
+}
+
+inline __m128i Sum3W_16(const __m128i left, const __m128i middle,
+ const __m128i right) {
+ const __m128i sum = VaddlLo8(left, middle);
+ return VaddwLo8(sum, right);
+}
+
+inline __m128i Sum3WLo_16(const __m128i a[3]) {
+ return Sum3W_16(a[0], a[1], a[2]);
+}
+
+inline __m128i Sum3WHi_16(const __m128i a[3]) {
+ const __m128i sum = VaddlHi8(a[0], a[1]);
+ return VaddwHi8(sum, a[2]);
+}
+
+inline __m128i Sum3WLo_32(const __m128i left, const __m128i middle,
+ const __m128i right) {
+ const __m128i sum = VaddlLo16(left, middle);
+ return VaddwLo16(sum, right);
+}
+
+inline __m128i Sum3WHi_32(const __m128i left, const __m128i middle,
+ const __m128i right) {
+ const __m128i sum = VaddlHi16(left, middle);
+ return VaddwHi16(sum, right);
+}
+
+inline __m128i* Sum3W_16x2(const __m128i a[3], __m128i sum[2]) {
+ sum[0] = Sum3WLo_16(a);
+ sum[1] = Sum3WHi_16(a);
+ return sum;
+}
+
+inline __m128i* Sum3W(const __m128i a[3], __m128i sum[2]) {
+ sum[0] = Sum3WLo_32(a[0], a[1], a[2]);
+ sum[1] = Sum3WHi_32(a[0], a[1], a[2]);
+ return sum;
+}
+
+template <int index>
+inline __m128i Sum3WLo(const __m128i a[3][2]) {
+ const __m128i b0 = a[0][index];
+ const __m128i b1 = a[1][index];
+ const __m128i b2 = a[2][index];
+ return Sum3WLo_32(b0, b1, b2);
+}
+
+inline __m128i Sum3WHi(const __m128i a[3][2]) {
+ const __m128i b0 = a[0][0];
+ const __m128i b1 = a[1][0];
+ const __m128i b2 = a[2][0];
+ return Sum3WHi_32(b0, b1, b2);
+}
+
+inline __m128i* Sum3W(const __m128i a[3][2], __m128i sum[3]) {
+ sum[0] = Sum3WLo<0>(a);
+ sum[1] = Sum3WHi(a);
+ sum[2] = Sum3WLo<1>(a);
+ return sum;
+}
+
+inline __m128i Sum5_16(const __m128i a[5]) {
+ const __m128i sum01 = _mm_add_epi16(a[0], a[1]);
+ const __m128i sum23 = _mm_add_epi16(a[2], a[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, a[4]);
+}
+
+inline __m128i Sum5_32(const __m128i a[5]) {
+ const __m128i sum01 = _mm_add_epi32(a[0], a[1]);
+ const __m128i sum23 = _mm_add_epi32(a[2], a[3]);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, a[4]);
+}
+
+inline __m128i Sum5WLo_16(const __m128i a[5]) {
+ const __m128i sum01 = VaddlLo8(a[0], a[1]);
+ const __m128i sum23 = VaddlLo8(a[2], a[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, a[4]);
+}
+
+inline __m128i Sum5WHi_16(const __m128i a[5]) {
+ const __m128i sum01 = VaddlHi8(a[0], a[1]);
+ const __m128i sum23 = VaddlHi8(a[2], a[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, a[4]);
+}
+
+inline __m128i Sum5WLo_32(const __m128i a[5]) {
+ const __m128i sum01 = VaddlLo16(a[0], a[1]);
+ const __m128i sum23 = VaddlLo16(a[2], a[3]);
+ const __m128i sum0123 = _mm_add_epi32(sum01, sum23);
+ return VaddwLo16(sum0123, a[4]);
+}
+
+inline __m128i Sum5WHi_32(const __m128i a[5]) {
+ const __m128i sum01 = VaddlHi16(a[0], a[1]);
+ const __m128i sum23 = VaddlHi16(a[2], a[3]);
+ const __m128i sum0123 = _mm_add_epi32(sum01, sum23);
+ return VaddwHi16(sum0123, a[4]);
+}
+
+inline __m128i* Sum5W_16D(const __m128i a[5], __m128i sum[2]) {
+ sum[0] = Sum5WLo_16(a);
+ sum[1] = Sum5WHi_16(a);
+ return sum;
+}
+
+inline __m128i* Sum5W_32x2(const __m128i a[5], __m128i sum[2]) {
+ sum[0] = Sum5WLo_32(a);
+ sum[1] = Sum5WHi_32(a);
+ return sum;
+}
+
+template <int index>
+inline __m128i Sum5WLo(const __m128i a[5][2]) {
+ __m128i b[5];
+ b[0] = a[0][index];
+ b[1] = a[1][index];
+ b[2] = a[2][index];
+ b[3] = a[3][index];
+ b[4] = a[4][index];
+ return Sum5WLo_32(b);
+}
+
+inline __m128i Sum5WHi(const __m128i a[5][2]) {
+ __m128i b[5];
+ b[0] = a[0][0];
+ b[1] = a[1][0];
+ b[2] = a[2][0];
+ b[3] = a[3][0];
+ b[4] = a[4][0];
+ return Sum5WHi_32(b);
+}
+
+inline __m128i* Sum5W_32x3(const __m128i a[5][2], __m128i sum[3]) {
+ sum[0] = Sum5WLo<0>(a);
+ sum[1] = Sum5WHi(a);
+ sum[2] = Sum5WLo<1>(a);
+ return sum;
+}
+
+inline __m128i Sum3Horizontal(const __m128i a) {
+ const auto left = a;
+ const auto middle = _mm_srli_si128(a, 2);
+ const auto right = _mm_srli_si128(a, 4);
+ return Sum3_16(left, middle, right);
+}
+
+inline __m128i Sum3Horizontal_16(const __m128i a[2]) {
+ const auto left = a[0];
+ const auto middle = _mm_alignr_epi8(a[1], a[0], 2);
+ const auto right = _mm_alignr_epi8(a[1], a[0], 4);
+ return Sum3_16(left, middle, right);
+}
+
+inline __m128i Sum3Horizontal_32(const __m128i a[2]) {
+ const auto left = a[0];
+ const auto middle = _mm_alignr_epi8(a[1], a[0], 4);
+ const auto right = _mm_alignr_epi8(a[1], a[0], 8);
+ return Sum3_32(left, middle, right);
+}
+
+inline __m128i* Sum3Horizontal_32x2(const __m128i a[3], __m128i sum[2]) {
+ {
+ const auto left = a[0];
+ const auto middle = _mm_alignr_epi8(a[1], a[0], 4);
+ const auto right = _mm_alignr_epi8(a[1], a[0], 8);
+ sum[0] = Sum3_32(left, middle, right);
+ }
+ {
+ const auto left = a[1];
+ const auto middle = _mm_alignr_epi8(a[2], a[1], 4);
+ const auto right = _mm_alignr_epi8(a[2], a[1], 8);
+ sum[1] = Sum3_32(left, middle, right);
+ }
+ return sum;
+}
+
+inline __m128i Sum3HorizontalOffset1(const __m128i a) {
+ const auto left = _mm_srli_si128(a, 2);
+ const auto middle = _mm_srli_si128(a, 4);
+ const auto right = _mm_srli_si128(a, 6);
+ return Sum3_16(left, middle, right);
+}
+
+inline __m128i Sum3HorizontalOffset1_16(const __m128i a[2]) {
+ const auto left = _mm_alignr_epi8(a[1], a[0], 2);
+ const auto middle = _mm_alignr_epi8(a[1], a[0], 4);
+ const auto right = _mm_alignr_epi8(a[1], a[0], 6);
+ return Sum3_16(left, middle, right);
+}
+
+inline __m128i Sum3HorizontalOffset1_32(const __m128i a[2]) {
+ const auto left = _mm_alignr_epi8(a[1], a[0], 4);
+ const auto middle = _mm_alignr_epi8(a[1], a[0], 8);
+ const auto right = _mm_alignr_epi8(a[1], a[0], 12);
+ return Sum3_32(left, middle, right);
+}
+
+inline void Sum3HorizontalOffset1_32x2(const __m128i a[3], __m128i sum[2]) {
+ sum[0] = Sum3HorizontalOffset1_32(a + 0);
+ sum[1] = Sum3HorizontalOffset1_32(a + 1);
+}
+
+inline __m128i Sum5Horizontal(const __m128i a) {
+ __m128i s[5];
+ s[0] = a;
+ s[1] = _mm_srli_si128(a, 2);
+ s[2] = _mm_srli_si128(a, 4);
+ s[3] = _mm_srli_si128(a, 6);
+ s[4] = _mm_srli_si128(a, 8);
+ return Sum5_16(s);
+}
+
+inline __m128i Sum5Horizontal_16(const __m128i a[2]) {
+ __m128i s[5];
+ s[0] = a[0];
+ s[1] = _mm_alignr_epi8(a[1], a[0], 2);
+ s[2] = _mm_alignr_epi8(a[1], a[0], 4);
+ s[3] = _mm_alignr_epi8(a[1], a[0], 6);
+ s[4] = _mm_alignr_epi8(a[1], a[0], 8);
+ return Sum5_16(s);
+}
+
+inline __m128i Sum5Horizontal_32(const __m128i a[2]) {
+ __m128i s[5];
+ s[0] = a[0];
+ s[1] = _mm_alignr_epi8(a[1], a[0], 4);
+ s[2] = _mm_alignr_epi8(a[1], a[0], 8);
+ s[3] = _mm_alignr_epi8(a[1], a[0], 12);
+ s[4] = a[1];
+ return Sum5_32(s);
+}
+
+inline __m128i* Sum5Horizontal_32x2(const __m128i a[3], __m128i sum[2]) {
+ __m128i s[5];
+ s[0] = a[0];
+ s[1] = _mm_alignr_epi8(a[1], a[0], 4);
+ s[2] = _mm_alignr_epi8(a[1], a[0], 8);
+ s[3] = _mm_alignr_epi8(a[1], a[0], 12);
+ s[4] = a[1];
+ sum[0] = Sum5_32(s);
+ s[0] = a[1];
+ s[1] = _mm_alignr_epi8(a[2], a[1], 4);
+ s[2] = _mm_alignr_epi8(a[2], a[1], 8);
+ s[3] = _mm_alignr_epi8(a[2], a[1], 12);
+ s[4] = a[2];
+ sum[1] = Sum5_32(s);
+ return sum;
+}
+
+template <int size, int offset>
+inline void BoxFilterPreProcess4(const __m128i* const row,
+ const __m128i* const row_sq, const uint32_t s,
+ uint16_t* const dst) {
+ static_assert(offset == 0 || offset == 1, "");
+ // Number of elements in the box being summed.
+ constexpr uint32_t n = size * size;
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ __m128i sum, sum_sq;
+ if (size == 3) {
+ __m128i temp32[2];
+ if (offset == 0) {
+ sum = Sum3Horizontal(Sum3WLo_16(row));
+ sum_sq = Sum3Horizontal_32(Sum3W(row_sq, temp32));
+ } else {
+ sum = Sum3HorizontalOffset1(Sum3WLo_16(row));
+ sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq, temp32));
+ }
+ }
+ if (size == 5) {
+ __m128i temp[2];
+ sum = Sum5Horizontal(Sum5WLo_16(row));
+ sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq, temp));
+ }
+ const __m128i sum_32 = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateSgrMA2<n>(sum_sq, sum_32, s);
+ const __m128i z1 = _mm_packus_epi32(z0, z0);
+ const __m128i z = _mm_min_epu16(z1, _mm_set1_epi16(255));
+ __m128i sgr_ma2 = _mm_setzero_si128();
+ sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<0>(z)], 4);
+ sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<1>(z)], 5);
+ sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<2>(z)], 6);
+ sgr_ma2 = _mm_insert_epi8(sgr_ma2, kSgrMa2Lookup[VgetLane16<3>(z)], 7);
+ const __m128i b2 = CalculateIntermediate4(sgr_ma2, sum_32, one_over_n);
+ const __m128i sgr_ma2_b2 = _mm_unpacklo_epi64(sgr_ma2, b2);
+ StoreAligned16(dst, sgr_ma2_b2);
+}
+
+template <int size, int offset>
+inline void BoxFilterPreProcess8(const __m128i* const row,
+ const __m128i row_sq[][2], const uint32_t s,
+ __m128i* const sgr_ma2, __m128i* const b2,
+ uint16_t* const dst) {
+ // Number of elements in the box being summed.
+ constexpr uint32_t n = size * size;
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ __m128i sum, sum_sq[2];
+ if (size == 3) {
+ __m128i temp16[2], temp32[3];
+ if (offset == 0) {
+ sum = Sum3Horizontal_16(Sum3W_16x2(row, temp16));
+ Sum3Horizontal_32x2(Sum3W(row_sq, temp32), sum_sq);
+ } else /* if (offset == 1) */ {
+ sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row, temp16));
+ Sum3HorizontalOffset1_32x2(Sum3W(row_sq, temp32), sum_sq);
+ }
+ }
+ if (size == 5) {
+ __m128i temp16[2], temp32[3];
+ sum = Sum5Horizontal_16(Sum5W_16D(row, temp16));
+ Sum5Horizontal_32x2(Sum5W_32x3(row_sq, temp32), sum_sq);
+ }
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateSgrMA2<n>(sum_sq[0], sum_lo, s);
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z1 = CalculateSgrMA2<n>(sum_sq[1], sum_hi, s);
+ const __m128i z01 = _mm_packus_epi32(z0, z1);
+ const __m128i z = _mm_min_epu16(z01, _mm_set1_epi16(255));
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<0>(z)], 8);
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<1>(z)], 9);
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<2>(z)], 10);
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<3>(z)], 11);
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<4>(z)], 12);
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<5>(z)], 13);
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<6>(z)], 14);
+ *sgr_ma2 = _mm_insert_epi8(*sgr_ma2, kSgrMa2Lookup[VgetLane16<7>(z)], 15);
+ *b2 = CalculateIntermediate8(*sgr_ma2, sum, one_over_n);
+ const __m128i sgr_ma2_b2 = _mm_unpackhi_epi64(*sgr_ma2, *b2);
+ StoreAligned16(dst, sgr_ma2_b2);
+}
+
+inline void Prepare3_8(const __m128i a, __m128i* const left,
+ __m128i* const middle, __m128i* const right) {
+ *left = _mm_srli_si128(a, 4);
+ *middle = _mm_srli_si128(a, 5);
+ *right = _mm_srli_si128(a, 6);
+}
+
+inline void Prepare3_16(const __m128i a[2], __m128i* const left,
+ __m128i* const middle, __m128i* const right) {
+ *left = _mm_alignr_epi8(a[1], a[0], 8);
+ *middle = _mm_alignr_epi8(a[1], a[0], 10);
+ *right = _mm_alignr_epi8(a[1], a[0], 12);
+}
+
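+// Computes the 3-4-3 weighted sum 3 * left + 4 * middle + 3 * right used by
+// the second self-guided filter pass.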
+inline __m128i Sum343(const __m128i a) {
+ __m128i left, middle, right;
+ Prepare3_8(a, &left, &middle, &right);
+ const auto sum = Sum3W_16(left, middle, right);
+ const auto sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, middle);
+}
+
+inline void Sum343_444(const __m128i a, __m128i* const sum343,
+ __m128i* const sum444) {
+ __m128i left, middle, right;
+ Prepare3_8(a, &left, &middle, &right);
+ const auto sum = Sum3W_16(left, middle, right);
+ const auto sum3 = Sum3_16(sum, sum, sum);
+ *sum343 = VaddwLo8(sum3, middle);
+ *sum444 = _mm_slli_epi16(sum, 2);
+}
+
+inline __m128i* Sum343W(const __m128i a[2], __m128i d[2]) {
+ __m128i left, middle, right;
+ Prepare3_16(a, &left, &middle, &right);
+ d[0] = Sum3WLo_32(left, middle, right);
+ d[1] = Sum3WHi_32(left, middle, right);
+ d[0] = Sum3_32(d[0], d[0], d[0]);
+ d[1] = Sum3_32(d[1], d[1], d[1]);
+ d[0] = VaddwLo16(d[0], middle);
+ d[1] = VaddwHi16(d[1], middle);
+ return d;
+}
+
+inline void Sum343_444W(const __m128i a[2], __m128i sum343[2],
+ __m128i sum444[2]) {
+ __m128i left, middle, right;
+ Prepare3_16(a, &left, &middle, &right);
+ sum444[0] = Sum3WLo_32(left, middle, right);
+ sum444[1] = Sum3WHi_32(left, middle, right);
+ sum343[0] = Sum3_32(sum444[0], sum444[0], sum444[0]);
+ sum343[1] = Sum3_32(sum444[1], sum444[1], sum444[1]);
+ sum343[0] = VaddwLo16(sum343[0], middle);
+ sum343[1] = VaddwHi16(sum343[1], middle);
+ sum444[0] = _mm_slli_epi32(sum444[0], 2);
+ sum444[1] = _mm_slli_epi32(sum444[1], 2);
+}
+
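+// Computes the 5-6-5 weighted sum 5 * left + 6 * middle + 5 * right used by
+// the first self-guided filter pass.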
+inline __m128i Sum565(const __m128i a) {
+ __m128i left, middle, right;
+ Prepare3_8(a, &left, &middle, &right);
+ const auto sum = Sum3W_16(left, middle, right);
+ const auto sum4 = _mm_slli_epi16(sum, 2);
+ const auto sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, middle);
+}
+
+inline __m128i Sum565W(const __m128i a) {
+ const auto left = a;
+ const auto middle = _mm_srli_si128(a, 2);
+ const auto right = _mm_srli_si128(a, 4);
+ const auto sum = Sum3WLo_32(left, middle, right);
+ const auto sum4 = _mm_slli_epi32(sum, 2);
+ const auto sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, middle);
+}
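// For reference, the 5-6-5 horizontal weighting used by the radius-2 pass as a
// scalar; it mirrors the Sum565Row helper this patch removes further down.
// Sketch only.
#include <cstdint>
inline uint32_t Sum565Scalar(uint32_t left, uint32_t middle, uint32_t right) {
  return 5 * left + 6 * middle + 5 * right;  // == 5 * (l + m + r) + m
}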
+
+// RightShiftWithRounding(
+// (a * src_ptr[x] + b), kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i a,
+ const __m128i b[2]) {
+ const __m128i src_u16 = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ // a: 256 * 32 = 8192 (14 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ const __m128i axsrc_lo = VmullLo16(a, src_u16);
+ const __m128i axsrc_hi = VmullHi16(a, src_u16);
+ // v: 8192 * 255 + 2082816 = 4171876 (22 bits)
+ const __m128i v_lo = _mm_add_epi32(axsrc_lo, b[0]);
+ const __m128i v_hi = _mm_add_epi32(axsrc_hi, b[1]);
+
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9
+ // 22 bits >> 8 = 14 bits
+ const __m128i dst_lo =
+ VrshrN32(v_lo, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+ const __m128i dst_hi =
+ VrshrN32(v_hi, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+ return _mm_packus_epi32(dst_lo, dst_hi); // 14 bits
+}
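// For reference, the scalar form of the expression quoted in the comment above,
// using the bit widths it documents (kSgrProjSgrBits = 8, kSgrProjRestoreBits =
// 4, so the total shift is 8 + shift - 4, i.e. 8 or 9). Sketch only.
#include <cstdint>
inline uint16_t FilteredOutputScalar(uint8_t src, uint32_t a, uint32_t b,
                                     int shift) {
  const uint32_t v = a * src + b;         // at most 22 bits
  const int total_shift = 8 + shift - 4;  // kSgrProjSgrBits + shift - kSgrProjRestoreBits
  return static_cast<uint16_t>((v + (1u << (total_shift - 1))) >> total_shift);
}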
+
+inline __m128i BoxFilterPass1(const __m128i src_u8, const __m128i a2,
+ const __m128i b2[2], __m128i sum565_a[2],
+ __m128i sum565_b[2][2]) {
+ __m128i b_v[2];
+ sum565_a[1] = Sum565(a2);
+ sum565_a[1] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[1]);
+ sum565_b[1][0] = Sum565W(_mm_alignr_epi8(b2[1], b2[0], 8));
+ sum565_b[1][1] = Sum565W(b2[1]);
+
+ __m128i a_v = _mm_add_epi16(sum565_a[0], sum565_a[1]);
+ b_v[0] = _mm_add_epi32(sum565_b[0][0], sum565_b[1][0]);
+ b_v[1] = _mm_add_epi32(sum565_b[0][1], sum565_b[1][1]);
+ return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits
+}
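// For reference: on the rows where BoxFilterPass1 is used, the two neighbouring
// 5-6-5 weighted lines (|sum565_a[0]| carried from the previous step and the
// freshly computed |sum565_a[1]|) are added and scaled with shift 5; the rows
// in between instead reuse the single new 5-6-5 line via
// CalculateFilteredOutput<4> in the main loops below, and the one-bit smaller
// shift gives that single line the same total weight. A scalar sketch of the
// two-line case:
#include <cstdint>
inline uint32_t Pass1TwoLineSum(const uint32_t above[3],
                                const uint32_t below[3]) {
  return 5 * above[0] + 6 * above[1] + 5 * above[2] +
         5 * below[0] + 6 * below[1] + 5 * below[2];
}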
+
+inline __m128i BoxFilterPass2(const __m128i src_u8, const __m128i a2,
+ const __m128i b2[2], __m128i sum343_a[4],
+ __m128i sum444_a[3], __m128i sum343_b[4][2],
+ __m128i sum444_b[3][2]) {
+ __m128i b_v[2];
+ Sum343_444(a2, &sum343_a[2], &sum444_a[1]);
+ sum343_a[2] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[2]);
+ sum444_a[1] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[1]);
+ __m128i a_v = Sum3_16(sum343_a[0], sum444_a[0], sum343_a[2]);
+ Sum343_444W(b2, sum343_b[2], sum444_b[1]);
+ b_v[0] = Sum3_32(sum343_b[0][0], sum444_b[0][0], sum343_b[2][0]);
+ b_v[1] = Sum3_32(sum343_b[0][1], sum444_b[0][1], sum343_b[2][1]);
+ return CalculateFilteredOutput<5>(src_u8, a_v, b_v); // 14 bits
+}
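// For reference: as the Sum3_16/Sum3_32 combination above reads, pass 2 adds a
// 4-4-4 weighted line for the current row to 3-4-3 weighted lines above and
// below it, so each output pixel sees a 3x3 window whose weights sum to 32.
// A scalar sketch of that window:
#include <cstdint>
inline uint32_t Pass2WindowSum(const uint32_t above[3], const uint32_t middle[3],
                               const uint32_t below[3]) {
  return 3 * above[0] + 4 * above[1] + 3 * above[2] +
         4 * middle[0] + 4 * middle[1] + 4 * middle[2] +
         3 * below[0] + 4 * below[1] + 3 * below[2];
}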
+
+inline void SelfGuidedDoubleMultiplier(
+ const __m128i src, const __m128i box_filter_process_output[2],
+ const __m128i w0, const __m128i w1, const __m128i w2, uint8_t* const dst) {
+ // |wN| values are signed. |src| values can be treated as int16_t.
+ const __m128i u = VshllN8(src, kSgrProjRestoreBits);
+ __m128i v_lo = VmulwLo16(w1, u);
+ v_lo = VmlawLo16(v_lo, w0, box_filter_process_output[0]);
+ v_lo = VmlawLo16(v_lo, w2, box_filter_process_output[1]);
+ __m128i v_hi = VmulwHi16(w1, u);
+ v_hi = VmlawHi16(v_hi, w0, box_filter_process_output[0]);
+ v_hi = VmlawHi16(v_hi, w2, box_filter_process_output[1]);
+ // |s| is saturated to uint8_t.
+ const __m128i s_lo =
+ VrshrNS32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i s_hi =
+ VrshrNS32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i s = _mm_packs_epi32(s_lo, s_hi);
+ StoreLo8(dst, _mm_packus_epi16(s, s));
+}
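// For reference, the scalar combination performed above when both passes ran:
// |p0| and |p1| are the two box filter outputs and w0/w1/w2 the signed weights,
// with w2 = (1 << kSgrProjPrecisionBits) - w0 - w1 as set up in BoxFilterProcess
// below. kSgrProjRestoreBits = 4 is documented above; kSgrProjPrecisionBits = 7
// is an assumption. Sketch only.
#include <algorithm>
#include <cstdint>
inline uint8_t SelfGuidedDoubleScalar(uint8_t src, int p0, int p1, int w0,
                                      int w1, int w2) {
  const int u = src << 4;                           // kSgrProjRestoreBits
  const int v = w1 * u + w0 * p0 + w2 * p1;         // signed accumulation
  const int shift = 4 + 7;                          // restore + precision bits
  const int s = (v + (1 << (shift - 1))) >> shift;  // rounded arithmetic shift
  return static_cast<uint8_t>(std::clamp(s, 0, 255));
}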
+
+inline void SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i box_filter_process_output,
+ const int16_t w0, const int16_t w1,
+ uint8_t* const dst) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ const __m128i u = VshllN8(src, kSgrProjRestoreBits);
+ // u * w1 + u * wN == u * (w1 + wN)
+ __m128i v_lo = VmullNLo8(u, w1);
+ v_lo = VmlalNLo16(v_lo, box_filter_process_output, w0);
+ __m128i v_hi = VmullNHi8(u, w1);
+ v_hi = VmlalNHi16(v_hi, box_filter_process_output, w0);
+ const __m128i s_lo =
+ VrshrNS32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i s_hi =
+ VrshrNS32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i s = _mm_packs_epi32(s_lo, s_hi);
+ StoreLo8(dst, _mm_packus_epi16(s, s));
+}
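// For reference, the single-pass scalar equivalent used when only one radius is
// non-zero; the unfiltered input takes the remaining fixed-point weight
// (w1 = (1 << kSgrProjPrecisionBits) - w0 in BoxFilterProcessPass1 below).
// Same assumed constants as the sketch above; sketch only.
#include <algorithm>
#include <cstdint>
inline uint8_t SelfGuidedSingleScalar(uint8_t src, int p, int w0, int w1) {
  const int u = src << 4;               // kSgrProjRestoreBits
  const int v = w1 * u + w0 * p;
  const int s = (v + (1 << 10)) >> 11;  // shift by 4 + 7 with rounding
  return static_cast<uint8_t>(std::clamp(s, 0, 255));
}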
+
+inline void BoxFilterProcess(const uint8_t* const src,
+ const ptrdiff_t src_stride,
+ const RestorationUnitInfo& restoration_info,
+ const int width, const int height,
+ const uint16_t s[2], uint16_t* const temp,
+ uint8_t* const dst, const ptrdiff_t dst_stride) {
+ // We have combined PreProcess and Process for the first pass by storing
+ // intermediate values in the |a2| region. The values stored are one vertical
+ // column of interleaved |a2| and |b2| values and consume 8 * |height| values.
+ // This is |height| and not |height| * 2 because PreProcess only generates
+ // output for every other row. When processing the next column we write the
+ // new scratch values right after reading the previously saved ones.
+
+ // The PreProcess phase calculates a 5x5 box sum for every other row
+ //
+ // PreProcess and Process have been combined into the same step. We need 12
+ // input values to generate 8 output values for PreProcess:
+ // 0 1 2 3 4 5 6 7 8 9 10 11
+ // 2 = 0 + 1 + 2 + 3 + 4
+ // 3 = 1 + 2 + 3 + 4 + 5
+ // 4 = 2 + 3 + 4 + 5 + 6
+ // 5 = 3 + 4 + 5 + 6 + 7
+ // 6 = 4 + 5 + 6 + 7 + 8
+ // 7 = 5 + 6 + 7 + 8 + 9
+ // 8 = 6 + 7 + 8 + 9 + 10
+ // 9 = 7 + 8 + 9 + 10 + 11
+ //
+ // and then we need 10 input values to generate 8 output values for Process:
+ // 0 1 2 3 4 5 6 7 8 9
+ // 1 = 0 + 1 + 2
+ // 2 = 1 + 2 + 3
+ // 3 = 2 + 3 + 4
+ // 4 = 3 + 4 + 5
+ // 5 = 4 + 5 + 6
+ // 6 = 5 + 6 + 7
+ // 7 = 6 + 7 + 8
+ // 8 = 7 + 8 + 9
+ //
+ // To avoid re-calculating PreProcess values over and over again we will do a
+ // single column of 8 output values and store the second half of them
+ // interleaved in |temp|. The first half is not stored, since it is used
+ // immediately and becomes useless for the next column. Next we will start the
+ // second column. When 2 rows have been calculated we can calculate Process
+ // and output the results.
+
+ // Calculate and store a single column. Scope so we can re-use the variable
+ // names for the next step.
+ uint16_t* ab_ptr = temp;
+
+ const uint8_t* const src_pre_process = src - 2 * src_stride - 3;
+ // Calculate intermediate results, including a two-pixel border; for example,
+ // if the unit size is 64x64, we calculate 68x68 pixels.
+ {
+ const uint8_t* column = src_pre_process;
+ __m128i row[5], row_sq[5];
+ row[0] = row[1] = LoadLo8Msan(column, 2 - width);
+ column += src_stride;
+ row[2] = LoadLo8Msan(column, 2 - width);
+
+ row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]);
+ row_sq[2] = VmullLo8(row[2], row[2]);
+
+ int y = (height + 2) >> 1;
do {
- const __m128i v_a =
- HorizontalAddVerticalSumsRadius1(&vertical_sum_of_squares[x - 1]);
- const __m128i v_b =
- HorizontalAddVerticalSumsRadius1(&vertical_sums[x - 1]);
- // -----------------------
- // calc p, z, a2
- // -----------------------
- const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0);
- const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0);
- const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0);
- const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b);
- const __m128i v_axn = _mm_mullo_epi32(v_a, v_n);
- const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd);
- const __m128i v_z = _mm_min_epi32(
- v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s),
- kSgrProjScaleBits));
- const __m128i v_a2 = _mm_set_epi32(kXByXPlus1[_mm_extract_epi32(v_z, 3)],
- kXByXPlus1[_mm_extract_epi32(v_z, 2)],
- kXByXPlus1[_mm_extract_epi32(v_z, 1)],
- kXByXPlus1[_mm_extract_epi32(v_z, 0)]);
- // -----------------------
- // calc b2 and store
- // -----------------------
- const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2);
- const __m128i v_b2 =
- _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n));
- StoreUnaligned16(&intermediate_result[0][x], v_a2);
- StoreUnaligned16(
- &intermediate_result[1][x],
- RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits));
- x += 4;
- } while (x <= width);
- intermediate_result[0] += array_stride;
- intermediate_result[1] += array_stride;
- } while (++y <= height);
-}
-
-void BoxFilterPreProcessRadius2_SSE4_1(
- const uint8_t* const src, ptrdiff_t stride, int width, int height,
- uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride,
- uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) {
- assert(s != 0);
- const uint32_t n = 25;
- const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
- const __m128i v_one_over_n =
- _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0);
- const __m128i v_sgrbits =
- _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0);
-
- // Calculate intermediate results, including one-pixel border, for example,
- // if unit size is 64x64, we calculate 66x66 pixels.
- int y = -1;
+ column += src_stride;
+ row[3] = LoadLo8Msan(column, 2 - width);
+ column += src_stride;
+ row[4] = LoadLo8Msan(column, 2 - width);
+
+ row_sq[3] = VmullLo8(row[3], row[3]);
+ row_sq[4] = VmullLo8(row[4], row[4]);
+
+ BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0);
+ BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8);
+ BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, s[1], ab_ptr + 16);
+
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0] = row_sq[2];
+ row_sq[1] = row_sq[3];
+ row_sq[2] = row_sq[4];
+ ab_ptr += 24;
+ } while (--y != 0);
+ if ((height & 1) != 0) {
+ column += src_stride;
+ row[3] = row[4] = LoadLo8Msan(column, 2 - width);
+ row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]);
+ BoxFilterPreProcess4<5, 0>(row + 0, row_sq + 0, s[0], ab_ptr + 0);
+ BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, s[1], ab_ptr + 8);
+ }
+ }
+
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ const __m128i w0_v = _mm_set1_epi32(w0);
+ const __m128i w1_v = _mm_set1_epi32(w1);
+ const __m128i w2_v = _mm_set1_epi32(w2);
+ int x = 0;
do {
- // Calculate the box vertical sums for each x position.
- const uint8_t* top_left = &src[(y - 2) * stride - 3];
- int vsx = -3;
- do {
- const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left));
- const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride));
- const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2));
- const __m128i v_box3 = _mm_cvtepu8_epi32(Load4(top_left + stride * 3));
- const __m128i v_box4 = _mm_cvtepu8_epi32(Load4(top_left + stride * 4));
- const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0);
- const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1);
- const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2);
- const __m128i v_sqr3 = _mm_mullo_epi32(v_box3, v_box3);
- const __m128i v_sqr4 = _mm_mullo_epi32(v_box4, v_box4);
- const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1);
- const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2);
- const __m128i v_a0123 = _mm_add_epi32(v_a012, v_sqr3);
- const __m128i v_a01234 = _mm_add_epi32(v_a0123, v_sqr4);
- const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1);
- const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2);
- const __m128i v_b0123 = _mm_add_epi32(v_b012, v_box3);
- const __m128i v_b01234 = _mm_add_epi32(v_b0123, v_box4);
- StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a01234);
- StoreUnaligned16(&vertical_sums[vsx], v_b01234);
- top_left += 4;
- vsx += 4;
- } while (vsx <= width + 2);
-
- int x = -1;
+ // |src_pre_process| is X but we already processed the first column of 4
+ // values so we want to start at Y and increment from there.
+ // X s s s Y s s
+ // s s s s s s s
+ // s s i i i i i
+ // s s i o o o o
+ // s s i o o o o
+
+ // Seed the loop with one line of output. Then, inside the loop, for each
+ // iteration we can output one even row and one odd row and carry the new
+ // line to the next iteration. In the diagram below 'i' values are
+ // intermediary values from the first step and '-' values are empty.
+ // iiii
+ // ---- > even row
+ // iiii - odd row
+ // ---- > even row
+ // iiii
+ __m128i a2[2], b2[2][2], sum565_a[2], sum343_a[4], sum444_a[3];
+ __m128i sum565_b[2][2], sum343_b[4][2], sum444_b[3][2];
+ ab_ptr = temp;
+ a2[0] = b2[0][0] = LoadAligned16(ab_ptr);
+ a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8);
+
+ const uint8_t* column = src_pre_process + x + 4;
+ __m128i row[5], row_sq[5][2];
+ row[0] = row[1] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[2] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[3] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[4] = LoadUnaligned16Msan(column, x + 14 - width);
+
+ row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]);
+ row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]);
+ row_sq[2][0] = VmullLo8(row[2], row[2]);
+ row_sq[2][1] = VmullHi8(row[2], row[2]);
+ row_sq[3][0] = VmullLo8(row[3], row[3]);
+ row_sq[3][1] = VmullHi8(row[3], row[3]);
+ row_sq[4][0] = VmullLo8(row[4], row[4]);
+ row_sq[4][1] = VmullHi8(row[4], row[4]);
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr);
+ BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1],
+ ab_ptr + 8);
+
+ // Pass 1 Process. These are the only values we need to propagate between
+ // rows.
+ sum565_a[0] = Sum565(a2[0]);
+ sum565_a[0] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[0]);
+ sum565_b[0][0] = Sum565W(_mm_alignr_epi8(b2[0][1], b2[0][0], 8));
+ sum565_b[0][1] = Sum565W(b2[0][1]);
+
+ sum343_a[0] = Sum343(a2[1]);
+ sum343_a[0] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[0]);
+ Sum343W(b2[1], sum343_b[0]);
+
+ a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 16);
+
+ BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1], &b2[1][1],
+ ab_ptr + 16);
+
+ Sum343_444(a2[1], &sum343_a[1], &sum444_a[0]);
+ sum343_a[1] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[1]);
+ sum444_a[0] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[0]);
+ Sum343_444W(b2[1], sum343_b[1], sum444_b[0]);
+
+ const uint8_t* src_ptr = src + x;
+ uint8_t* dst_ptr = dst + x;
+
+ // Calculate one output line. Add in the line from the previous pass and
+ // output one even row. Sum the new line and output the odd row. Carry the
+ // new row into the next pass.
+ for (int y = height >> 1; y != 0; --y) {
+ ab_ptr += 24;
+ a2[0] = b2[0][0] = LoadAligned16(ab_ptr);
+ a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8);
+
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
+ row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
+ row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
+
+ column += src_stride;
+ row[3] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[4] = LoadUnaligned16Msan(column, x + 14 - width);
+
+ row_sq[3][0] = VmullLo8(row[3], row[3]);
+ row_sq[3][1] = VmullHi8(row[3], row[3]);
+ row_sq[4][0] = VmullLo8(row[4], row[4]);
+ row_sq[4][1] = VmullHi8(row[4], row[4]);
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr);
+ BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1],
+ ab_ptr + 8);
+
+ __m128i p[2];
+ const __m128i src0 = LoadLo8(src_ptr);
+ p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b);
+ p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b,
+ sum444_b);
+ SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ const __m128i src1 = LoadLo8(src_ptr);
+ p[0] = CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]);
+ a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 16);
+ BoxFilterPreProcess8<3, 1>(row + 2, row_sq + 2, s[1], &a2[1], &b2[1][1],
+ ab_ptr + 16);
+ p[1] = BoxFilterPass2(src1, a2[1], b2[1], sum343_a + 1, sum444_a + 1,
+ sum343_b + 1, sum444_b + 1);
+ SelfGuidedDoubleMultiplier(src1, p, w0_v, w1_v, w2_v, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ sum565_a[0] = sum565_a[1];
+ sum565_b[0][0] = sum565_b[1][0], sum565_b[0][1] = sum565_b[1][1];
+ sum343_a[0] = sum343_a[2];
+ sum343_a[1] = sum343_a[3];
+ sum444_a[0] = sum444_a[2];
+ sum343_b[0][0] = sum343_b[2][0], sum343_b[0][1] = sum343_b[2][1];
+ sum343_b[1][0] = sum343_b[3][0], sum343_b[1][1] = sum343_b[3][1];
+ sum444_b[0][0] = sum444_b[2][0], sum444_b[0][1] = sum444_b[2][1];
+ }
+ if ((height & 1) != 0) {
+ ab_ptr += 24;
+ a2[0] = b2[0][0] = LoadAligned16(ab_ptr);
+ a2[1] = b2[1][0] = LoadAligned16(ab_ptr + 8);
+
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
+ row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
+ row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
+
+ column += src_stride;
+ row[3] = row[4] = LoadUnaligned16Msan(column, x + 14 - width);
+
+ row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]);
+ row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]);
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s[0], &a2[0], &b2[0][1], ab_ptr);
+ BoxFilterPreProcess8<3, 1>(row + 1, row_sq + 1, s[1], &a2[1], &b2[1][1],
+ ab_ptr + 8);
+
+ __m128i p[2];
+ const __m128i src0 = LoadLo8(src_ptr);
+ p[0] = BoxFilterPass1(src0, a2[0], b2[0], sum565_a, sum565_b);
+ p[1] = BoxFilterPass2(src0, a2[1], b2[1], sum343_a, sum444_a, sum343_b,
+ sum444_b);
+ SelfGuidedDoubleMultiplier(src0, p, w0_v, w1_v, w2_v, dst_ptr);
+ }
+ x += 8;
+ } while (x < width);
+}
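// For reference on the scratch layout used above: each BoxFilterPreProcess4
// call stores 16 bytes (8 uint16_t) into |temp|, and every pair of source rows
// consumes three such stores: one 5x5 a2/b2 pair at offset 0 and two 3x3 rows
// at offsets 8 and 16, which is why |ab_ptr| advances by 24 per iteration.
// A hypothetical helper for the offset of row pair |i|, as an illustration:
constexpr int TempOffsetForRowPair(int i) { return 24 * i; }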
+
+inline void BoxFilterProcessPass1(const uint8_t* const src,
+ const ptrdiff_t src_stride,
+ const RestorationUnitInfo& restoration_info,
+ const int width, const int height,
+ const uint32_t s, uint16_t* const temp,
+ uint8_t* const dst,
+ const ptrdiff_t dst_stride) {
+ // We have combined PreProcess and Process for the first pass by storing
+ // intermediate values in the |a2| region. The values stored are one vertical
+ // column of interleaved |a2| and |b2| values and consume 8 * |height| values.
+ // This is |height| and not |height| * 2 because PreProcess only generates
+ // output for every other row. When processing the next column we write the
+ // new scratch values right after reading the previously saved ones.
+
+ // The PreProcess phase calculates a 5x5 box sum for every other row
+ //
+ // PreProcess and Process have been combined into the same step. We need 12
+ // input values to generate 8 output values for PreProcess:
+ // 0 1 2 3 4 5 6 7 8 9 10 11
+ // 2 = 0 + 1 + 2 + 3 + 4
+ // 3 = 1 + 2 + 3 + 4 + 5
+ // 4 = 2 + 3 + 4 + 5 + 6
+ // 5 = 3 + 4 + 5 + 6 + 7
+ // 6 = 4 + 5 + 6 + 7 + 8
+ // 7 = 5 + 6 + 7 + 8 + 9
+ // 8 = 6 + 7 + 8 + 9 + 10
+ // 9 = 7 + 8 + 9 + 10 + 11
+ //
+ // and then we need 10 input values to generate 8 output values for Process:
+ // 0 1 2 3 4 5 6 7 8 9
+ // 1 = 0 + 1 + 2
+ // 2 = 1 + 2 + 3
+ // 3 = 2 + 3 + 4
+ // 4 = 3 + 4 + 5
+ // 5 = 4 + 5 + 6
+ // 6 = 5 + 6 + 7
+ // 7 = 6 + 7 + 8
+ // 8 = 7 + 8 + 9
+ //
+ // To avoid re-calculating PreProcess values over and over again we will do a
+ // single column of 8 output values and store the second half of them
+ // interleaved in |temp|. The first half is not stored, since it is used
+ // immediately and becomes useless for the next column. Next we will start the
+ // second column. When 2 rows have been calculated we can calculate Process
+ // and output the results.
+
+ // Calculate and store a single column. Scope so we can re-use the variable
+ // names for the next step.
+ uint16_t* ab_ptr = temp;
+
+ const uint8_t* const src_pre_process = src - 2 * src_stride - 3;
+ // Calculate intermediate results, including a two-pixel border; for example,
+ // if the unit size is 64x64, we calculate 68x68 pixels.
+ {
+ const uint8_t* column = src_pre_process;
+ __m128i row[5], row_sq[5];
+ row[0] = row[1] = LoadLo8Msan(column, 2 - width);
+ column += src_stride;
+ row[2] = LoadLo8Msan(column, 2 - width);
+
+ row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]);
+ row_sq[2] = VmullLo8(row[2], row[2]);
+
+ int y = (height + 2) >> 1;
do {
- const __m128i v_a =
- HorizontalAddVerticalSumsRadius2(&vertical_sum_of_squares[x - 2]);
- const __m128i v_b =
- HorizontalAddVerticalSumsRadius2(&vertical_sums[x - 2]);
- // -----------------------
- // calc p, z, a2
- // -----------------------
- const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0);
- const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0);
- const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0);
- const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b);
- const __m128i v_axn = _mm_mullo_epi32(v_a, v_n);
- const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd);
- const __m128i v_z = _mm_min_epi32(
- v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s),
- kSgrProjScaleBits));
- const __m128i v_a2 = _mm_set_epi32(kXByXPlus1[_mm_extract_epi32(v_z, 3)],
- kXByXPlus1[_mm_extract_epi32(v_z, 2)],
- kXByXPlus1[_mm_extract_epi32(v_z, 1)],
- kXByXPlus1[_mm_extract_epi32(v_z, 0)]);
- // -----------------------
- // calc b2 and store
- // -----------------------
- const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2);
- const __m128i v_b2 =
- _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n));
- StoreUnaligned16(&intermediate_result[0][x], v_a2);
- StoreUnaligned16(
- &intermediate_result[1][x],
- RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits));
- x += 4;
- } while (x <= width);
- intermediate_result[0] += 2 * array_stride;
- intermediate_result[1] += 2 * array_stride;
- y += 2;
- } while (y <= height);
-}
-
-void BoxFilterPreProcess_SSE4_1(const RestorationUnitInfo& restoration_info,
- const uint8_t* const src, ptrdiff_t stride,
- int width, int height, int pass,
- RestorationBuffer* const buffer) {
- uint32_t vertical_sums_buf[kRestorationProcessingUnitSize +
- 2 * kRestorationBorder + kRestorationPadding];
- uint32_t vertical_sum_of_squares_buf[kRestorationProcessingUnitSize +
- 2 * kRestorationBorder +
- kRestorationPadding];
- uint32_t* vertical_sums = &vertical_sums_buf[4];
- uint32_t* vertical_sum_of_squares = &vertical_sum_of_squares_buf[4];
- const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride;
- // The size of the intermediate result buffer is the size of the filter area
- // plus horizontal (3) and vertical (3) padding. The processing start point
- // is the filter area start point -1 row and -1 column. Therefore we need to
- // set offset and use the intermediate_result as the start point for
- // processing.
- const ptrdiff_t intermediate_buffer_offset =
- kRestorationBorder * array_stride + kRestorationBorder;
- uint32_t* intermediate_result[2] = {
- buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset -
- array_stride,
- buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset -
- array_stride};
- const int sgr_proj_index = restoration_info.sgr_proj_info.index;
- if (pass == 0) {
- assert(kSgrProjParams[sgr_proj_index][0] == 2);
- BoxFilterPreProcessRadius2_SSE4_1(src, stride, width, height,
- kSgrScaleParameter[sgr_proj_index][0],
- intermediate_result, array_stride,
- vertical_sums, vertical_sum_of_squares);
- } else {
- assert(kSgrProjParams[sgr_proj_index][2] == 1);
- BoxFilterPreProcessRadius1_SSE4_1(src, stride, width, height,
- kSgrScaleParameter[sgr_proj_index][1],
- intermediate_result, array_stride,
- vertical_sums, vertical_sum_of_squares);
+ column += src_stride;
+ row[3] = LoadLo8Msan(column, 2 - width);
+ column += src_stride;
+ row[4] = LoadLo8Msan(column, 2 - width);
+
+ row_sq[3] = VmullLo8(row[3], row[3]);
+ row_sq[4] = VmullLo8(row[4], row[4]);
+
+ BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr);
+
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0] = row_sq[2];
+ row_sq[1] = row_sq[3];
+ row_sq[2] = row_sq[4];
+ ab_ptr += 8;
+ } while (--y != 0);
+ if ((height & 1) != 0) {
+ column += src_stride;
+ row[3] = row[4] = LoadLo8Msan(column, 2 - width);
+ row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]);
+ BoxFilterPreProcess4<5, 0>(row, row_sq, s, ab_ptr);
+ }
}
-}
-inline __m128i Sum565Row(const __m128i v_DBCA, const __m128i v_XXFE) {
- __m128i v_sum = v_DBCA;
- const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4);
- v_sum = _mm_add_epi32(v_sum, v_EDCB);
- const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8);
- v_sum = _mm_add_epi32(v_sum, v_FEDC);
- // D C B A x4
- // + E D C B x4
- // + F E D C x4
- v_sum = _mm_slli_epi32(v_sum, 2);
- // + D C B A
- v_sum = _mm_add_epi32(v_sum, v_DBCA); // 5
- // + E D C B x2
- v_sum = _mm_add_epi32(v_sum, _mm_slli_epi32(v_EDCB, 1)); // 6
- // + F E D C
- return _mm_add_epi32(v_sum, v_FEDC); // 5
-}
-
-inline __m128i Process3x3Block_565_Odd(const uint32_t* src, ptrdiff_t stride) {
- // 0 0 0
- // 5 6 5
- // 0 0 0
- const uint32_t* top_left = src - 1;
- const __m128i v_src1_lo = LoadUnaligned16(top_left + stride);
- const __m128i v_src1_hi = LoadLo8(top_left + stride + 4);
- return Sum565Row(v_src1_lo, v_src1_hi);
-}
-
-inline __m128i Process3x3Block_565_Even(const uint32_t* src, ptrdiff_t stride) {
- // 5 6 5
- // 0 0 0
- // 5 6 5
- const uint32_t* top_left = src - 1;
- const __m128i v_src0_lo = LoadUnaligned16(top_left);
- const __m128i v_src0_hi = LoadLo8(top_left + 4);
- const __m128i v_src2_lo = LoadUnaligned16(top_left + stride * 2);
- const __m128i v_src2_hi = LoadLo8(top_left + stride * 2 + 4);
- const __m128i v_a0 = Sum565Row(v_src0_lo, v_src0_hi);
- const __m128i v_a2 = Sum565Row(v_src2_lo, v_src2_hi);
- return _mm_add_epi32(v_a0, v_a2);
-}
-
-inline __m128i Sum343Row(const __m128i v_DBCA, const __m128i v_XXFE) {
- __m128i v_sum = v_DBCA;
- const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4);
- v_sum = _mm_add_epi32(v_sum, v_EDCB);
- const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8);
- v_sum = _mm_add_epi32(v_sum, v_FEDC);
- // D C B A x4
- // + E D C B x4
- // + F E D C x4
- v_sum = _mm_slli_epi32(v_sum, 2); // 4
- // - D C B A
- v_sum = _mm_sub_epi32(v_sum, v_DBCA); // 3
- // - F E D C
- return _mm_sub_epi32(v_sum, v_FEDC); // 3
-}
-
-inline __m128i Sum444Row(const __m128i v_DBCA, const __m128i v_XXFE) {
- __m128i v_sum = v_DBCA;
- const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4);
- v_sum = _mm_add_epi32(v_sum, v_EDCB);
- const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8);
- v_sum = _mm_add_epi32(v_sum, v_FEDC);
- // D C B A x4
- // + E D C B x4
- // + F E D C x4
- return _mm_slli_epi32(v_sum, 2); // 4
-}
-
-inline __m128i Process3x3Block_343(const uint32_t* src, ptrdiff_t stride) {
- const uint32_t* top_left = src - 1;
- const __m128i v_ir0_lo = LoadUnaligned16(top_left);
- const __m128i v_ir0_hi = LoadLo8(top_left + 4);
- const __m128i v_ir1_lo = LoadUnaligned16(top_left + stride);
- const __m128i v_ir1_hi = LoadLo8(top_left + stride + 4);
- const __m128i v_ir2_lo = LoadUnaligned16(top_left + stride * 2);
- const __m128i v_ir2_hi = LoadLo8(top_left + stride * 2 + 4);
- const __m128i v_a0 = Sum343Row(v_ir0_lo, v_ir0_hi);
- const __m128i v_a1 = Sum444Row(v_ir1_lo, v_ir1_hi);
- const __m128i v_a2 = Sum343Row(v_ir2_lo, v_ir2_hi);
- return _mm_add_epi32(v_a0, _mm_add_epi32(v_a1, v_a2));
-}
-
-void BoxFilterProcess_SSE4_1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src, ptrdiff_t stride, int width,
- int height, RestorationBuffer* const buffer) {
- const int sgr_proj_index = restoration_info.sgr_proj_info.index;
- for (int pass = 0; pass < 2; ++pass) {
- const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
- const uint8_t* src_ptr = src;
- if (radius == 0) continue;
-
- BoxFilterPreProcess_SSE4_1(restoration_info, src_ptr, stride, width, height,
- pass, buffer);
-
- int* filtered_output = buffer->box_filter_process_output[pass];
- const ptrdiff_t filtered_output_stride =
- buffer->box_filter_process_output_stride;
- const ptrdiff_t intermediate_stride =
- buffer->box_filter_process_intermediate_stride;
- // Set intermediate buffer start point to the actual start point of
- // filtering.
- const ptrdiff_t intermediate_buffer_offset =
- kRestorationBorder * intermediate_stride + kRestorationBorder;
-
- if (pass == 0) {
- int y = 0;
- do {
- const int shift = ((y & 1) != 0) ? 4 : 5;
- uint32_t* const array_start[2] = {
- buffer->box_filter_process_intermediate[0] +
- intermediate_buffer_offset + y * intermediate_stride,
- buffer->box_filter_process_intermediate[1] +
- intermediate_buffer_offset + y * intermediate_stride};
- uint32_t* intermediate_result2[2] = {
- array_start[0] - intermediate_stride,
- array_start[1] - intermediate_stride};
- if ((y & 1) == 0) { // even row
- int x = 0;
- do {
- // 5 6 5
- // 0 0 0
- // 5 6 5
- const __m128i v_A = Process3x3Block_565_Even(
- &intermediate_result2[0][x], intermediate_stride);
- const __m128i v_B = Process3x3Block_565_Even(
- &intermediate_result2[1][x], intermediate_stride);
- const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x));
- const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src);
- const __m128i v_v = _mm_add_epi32(v_v0, v_B);
- const __m128i v_filtered = RightShiftWithRounding_U32(
- v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
-
- StoreUnaligned16(&filtered_output[x], v_filtered);
- x += 4;
- } while (x < width);
- } else {
- int x = 0;
- do {
- // 0 0 0
- // 5 6 5
- // 0 0 0
- const __m128i v_A = Process3x3Block_565_Odd(
- &intermediate_result2[0][x], intermediate_stride);
- const __m128i v_B = Process3x3Block_565_Odd(
- &intermediate_result2[1][x], intermediate_stride);
- const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x));
- const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src);
- const __m128i v_v = _mm_add_epi32(v_v0, v_B);
- const __m128i v_filtered = RightShiftWithRounding_U32(
- v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
-
- StoreUnaligned16(&filtered_output[x], v_filtered);
- x += 4;
- } while (x < width);
- }
- src_ptr += stride;
- filtered_output += filtered_output_stride;
- } while (++y < height);
- } else {
- int y = 0;
- do {
- const int shift = 5;
- uint32_t* const array_start[2] = {
- buffer->box_filter_process_intermediate[0] +
- intermediate_buffer_offset + y * intermediate_stride,
- buffer->box_filter_process_intermediate[1] +
- intermediate_buffer_offset + y * intermediate_stride};
- uint32_t* intermediate_result2[2] = {
- array_start[0] - intermediate_stride,
- array_start[1] - intermediate_stride};
- int x = 0;
- do {
- const __m128i v_A = Process3x3Block_343(&intermediate_result2[0][x],
- intermediate_stride);
- const __m128i v_B = Process3x3Block_343(&intermediate_result2[1][x],
- intermediate_stride);
- const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x));
- const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src);
- const __m128i v_v = _mm_add_epi32(v_v0, v_B);
- const __m128i v_filtered = RightShiftWithRounding_U32(
- v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
-
- StoreUnaligned16(&filtered_output[x], v_filtered);
- x += 4;
- } while (x < width);
- src_ptr += stride;
- filtered_output += filtered_output_stride;
- } while (++y < height);
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = (1 << kSgrProjPrecisionBits) - w0;
+ int x = 0;
+ do {
+ // |src_pre_process| is X but we already processed the first column of 4
+ // values so we want to start at Y and increment from there.
+ // X s s s Y s s
+ // s s s s s s s
+ // s s i i i i i
+ // s s i o o o o
+ // s s i o o o o
+
+ // Seed the loop with one line of output. Then, inside the loop, for each
+ // iteration we can output one even row and one odd row and carry the new
+ // line to the next iteration. In the diagram below 'i' values are
+ // intermediary values from the first step and '-' values are empty.
+ // iiii
+ // ---- > even row
+ // iiii - odd row
+ // ---- > even row
+ // iiii
+ __m128i a2[2], b2[2], sum565_a[2], sum565_b[2][2];
+ ab_ptr = temp;
+ a2[0] = b2[0] = LoadAligned16(ab_ptr);
+
+ const uint8_t* column = src_pre_process + x + 4;
+ __m128i row[5], row_sq[5][2];
+ row[0] = row[1] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[2] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[3] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[4] = LoadUnaligned16Msan(column, x + 14 - width);
+
+ row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]);
+ row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]);
+ row_sq[2][0] = VmullLo8(row[2], row[2]);
+ row_sq[2][1] = VmullHi8(row[2], row[2]);
+ row_sq[3][0] = VmullLo8(row[3], row[3]);
+ row_sq[3][1] = VmullHi8(row[3], row[3]);
+ row_sq[4][0] = VmullLo8(row[4], row[4]);
+ row_sq[4][1] = VmullHi8(row[4], row[4]);
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr);
+
+ // Pass 1 Process. These are the only values we need to propagate between
+ // rows.
+ sum565_a[0] = Sum565(a2[0]);
+ sum565_a[0] = _mm_sub_epi16(_mm_set1_epi16((5 + 6 + 5) * 256), sum565_a[0]);
+ sum565_b[0][0] = Sum565W(_mm_alignr_epi8(b2[1], b2[0], 8));
+ sum565_b[0][1] = Sum565W(b2[1]);
+
+ const uint8_t* src_ptr = src + x;
+ uint8_t* dst_ptr = dst + x;
+
+ // Calculate one output line. Add in the line from the previous pass and
+ // output one even row. Sum the new line and output the odd row. Carry the
+ // new row into the next pass.
+ for (int y = height >> 1; y != 0; --y) {
+ ab_ptr += 8;
+ a2[0] = b2[0] = LoadAligned16(ab_ptr);
+
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
+ row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
+ row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
+
+ column += src_stride;
+ row[3] = LoadUnaligned16Msan(column, x + 14 - width);
+ column += src_stride;
+ row[4] = LoadUnaligned16Msan(column, x + 14 - width);
+
+ row_sq[3][0] = VmullLo8(row[3], row[3]);
+ row_sq[3][1] = VmullHi8(row[3], row[3]);
+ row_sq[4][0] = VmullLo8(row[4], row[4]);
+ row_sq[4][1] = VmullHi8(row[4], row[4]);
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr);
+
+ const __m128i src0 = LoadLo8(src_ptr);
+ const __m128i p0 = BoxFilterPass1(src0, a2[0], b2, sum565_a, sum565_b);
+ SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ const __m128i src1 = LoadLo8(src_ptr);
+ const __m128i p1 =
+ CalculateFilteredOutput<4>(src1, sum565_a[1], sum565_b[1]);
+ SelfGuidedSingleMultiplier(src1, p1, w0, w1, dst_ptr);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ sum565_a[0] = sum565_a[1];
+ sum565_b[0][0] = sum565_b[1][0], sum565_b[0][1] = sum565_b[1][1];
+ }
+ if ((height & 1) != 0) {
+ ab_ptr += 8;
+ a2[0] = b2[0] = LoadAligned16(ab_ptr);
+
+ row[0] = row[2];
+ row[1] = row[3];
+ row[2] = row[4];
+
+ row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
+ row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
+ row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
+
+ column += src_stride;
+ row[3] = row[4] = LoadUnaligned16Msan(column, x + 14 - width);
+
+ row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]);
+ row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]);
+
+ BoxFilterPreProcess8<5, 0>(row, row_sq, s, &a2[0], &b2[1], ab_ptr);
+
+ const __m128i src0 = LoadLo8(src_ptr);
+ const __m128i p0 = BoxFilterPass1(src0, a2[0], b2, sum565_a, sum565_b);
+ SelfGuidedSingleMultiplier(src0, p0, w0, w1, dst_ptr);
}
+ x += 8;
+ } while (x < width);
+}
+
+inline void BoxFilterProcessPass2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const RestorationUnitInfo& restoration_info,
+ const int width, const int height,
+ const uint32_t s, uint16_t* const temp,
+ uint8_t* const dst,
+ const ptrdiff_t dst_stride) {
+ uint16_t* ab_ptr = temp;
+
+ // Calculate intermediate results, including a one-pixel border; for example,
+ // if the unit size is 64x64, we calculate 66x66 pixels.
+ // Because the vector calculations start in blocks of 4, we actually get
+ // 68 values.
+ const uint8_t* const src_top_left_corner = src - 2 * src_stride - 2;
+ {
+ const uint8_t* column = src_top_left_corner;
+ __m128i row[3], row_sq[3];
+ row[0] = LoadLo8Msan(column, 4 - width);
+ column += src_stride;
+ row[1] = LoadLo8Msan(column, 4 - width);
+ row_sq[0] = VmullLo8(row[0], row[0]);
+ row_sq[1] = VmullLo8(row[1], row[1]);
+
+ int y = height + 2;
+ do {
+ column += src_stride;
+ row[2] = LoadLo8Msan(column, 4 - width);
+ row_sq[2] = VmullLo8(row[2], row[2]);
+
+ BoxFilterPreProcess4<3, 0>(row, row_sq, s, ab_ptr);
+
+ row[0] = row[1];
+ row[1] = row[2];
+
+ row_sq[0] = row_sq[1];
+ row_sq[1] = row_sq[2];
+ ab_ptr += 8;
+ } while (--y != 0);
}
+
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ int x = 0;
+ do {
+ ab_ptr = temp;
+
+ __m128i a2, b2[2], sum343_a[3], sum444_a[2], sum343_b[3][2], sum444_b[2][2];
+ a2 = b2[0] = LoadAligned16(ab_ptr);
+
+ const uint8_t* column = src_top_left_corner + x + 4;
+ __m128i row[3], row_sq[3][2];
+ row[0] = LoadUnaligned16Msan(column, x + 16 - width);
+ column += src_stride;
+ row[1] = LoadUnaligned16Msan(column, x + 16 - width);
+ column += src_stride;
+ row[2] = LoadUnaligned16Msan(column, x + 16 - width);
+
+ row_sq[0][0] = VmullLo8(row[0], row[0]);
+ row_sq[0][1] = VmullHi8(row[0], row[0]);
+ row_sq[1][0] = VmullLo8(row[1], row[1]);
+ row_sq[1][1] = VmullHi8(row[1], row[1]);
+ row_sq[2][0] = VmullLo8(row[2], row[2]);
+ row_sq[2][1] = VmullHi8(row[2], row[2]);
+
+ BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr);
+
+ sum343_a[0] = Sum343(a2);
+ sum343_a[0] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[0]);
+ Sum343W(b2, sum343_b[0]);
+
+ ab_ptr += 8;
+ a2 = b2[0] = LoadAligned16(ab_ptr);
+
+ row[0] = row[1];
+ row[1] = row[2];
+
+ row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1];
+ row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1];
+ column += src_stride;
+ row[2] = LoadUnaligned16Msan(column, x + 16 - width);
+
+ row_sq[2][0] = VmullLo8(row[2], row[2]);
+ row_sq[2][1] = VmullHi8(row[2], row[2]);
+
+ BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr);
+
+ Sum343_444(a2, &sum343_a[1], &sum444_a[0]);
+ sum343_a[1] = _mm_sub_epi16(_mm_set1_epi16((3 + 4 + 3) * 256), sum343_a[1]);
+ sum444_a[0] = _mm_sub_epi16(_mm_set1_epi16((4 + 4 + 4) * 256), sum444_a[0]);
+ Sum343_444W(b2, sum343_b[1], sum444_b[0]);
+
+ const uint8_t* src_ptr = src + x;
+ uint8_t* dst_ptr = dst + x;
+ int y = height;
+ do {
+ ab_ptr += 8;
+ a2 = b2[0] = LoadAligned16(ab_ptr);
+
+ row[0] = row[1];
+ row[1] = row[2];
+
+ row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1];
+ row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1];
+ column += src_stride;
+ row[2] = LoadUnaligned16Msan(column, x + 16 - width);
+
+ row_sq[2][0] = VmullLo8(row[2], row[2]);
+ row_sq[2][1] = VmullHi8(row[2], row[2]);
+
+ BoxFilterPreProcess8<3, 0>(row, row_sq, s, &a2, &b2[1], ab_ptr);
+
+ const __m128i src_u8 = LoadLo8(src_ptr);
+ const __m128i p = BoxFilterPass2(src_u8, a2, b2, sum343_a, sum444_a,
+ sum343_b, sum444_b);
+ SelfGuidedSingleMultiplier(src_u8, p, w0, w1, dst_ptr);
+ sum343_a[0] = sum343_a[1];
+ sum343_a[1] = sum343_a[2];
+ sum444_a[0] = sum444_a[1];
+ sum343_b[0][0] = sum343_b[1][0], sum343_b[0][1] = sum343_b[1][1];
+ sum343_b[1][0] = sum343_b[2][0], sum343_b[1][1] = sum343_b[2][1];
+ sum444_b[0][0] = sum444_b[1][0], sum444_b[0][1] = sum444_b[1][1];
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
}
-void SelfGuidedFilter_SSE4_1(const void* source, void* dest,
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to |dest|
+// at the end of each row. It is safe to overwrite this output, as it will not
+// be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(const void* const source, void* const dest,
const RestorationUnitInfo& restoration_info,
- ptrdiff_t source_stride, ptrdiff_t dest_stride,
- int width, int height,
+ const ptrdiff_t source_stride,
+ const ptrdiff_t dest_stride, const int width,
+ const int height,
RestorationBuffer* const buffer) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
const auto* src = static_cast<const uint8_t*>(source);
auto* dst = static_cast<uint8_t*>(dest);
- const int w0 = restoration_info.sgr_proj_info.multiplier[0];
- const int w1 = restoration_info.sgr_proj_info.multiplier[1];
- const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
- const int index = restoration_info.sgr_proj_info.index;
- const uint8_t r0 = kSgrProjParams[index][0];
- const uint8_t r1 = kSgrProjParams[index][2];
- const ptrdiff_t array_stride = buffer->box_filter_process_output_stride;
- int* box_filter_process_output[2] = {buffer->box_filter_process_output[0],
- buffer->box_filter_process_output[1]};
-
- BoxFilterProcess_SSE4_1(restoration_info, src, source_stride, width, height,
- buffer);
-
- const __m128i v_w0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w0), 0);
- const __m128i v_w1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w1), 0);
- const __m128i v_w2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w2), 0);
- const __m128i v_r0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r0), 0);
- const __m128i v_r1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r1), 0);
- const __m128i zero = _mm_setzero_si128();
- // Create masks used to select between src and box_filter_process_output.
- const __m128i v_r0_mask = _mm_cmpeq_epi32(v_r0, zero);
- const __m128i v_r1_mask = _mm_cmpeq_epi32(v_r1, zero);
-
- int y = 0;
- do {
- int x = 0;
- do {
- const __m128i v_src = _mm_cvtepu8_epi32(Load4(src + x));
- const __m128i v_u = _mm_slli_epi32(v_src, kSgrProjRestoreBits);
- const __m128i v_v_a = _mm_mullo_epi32(v_w1, v_u);
- const __m128i v_bfp_out0 =
- LoadUnaligned16(&box_filter_process_output[0][x]);
- // Select u or box_filter_process_output[0][x].
- const __m128i v_r0_mult = _mm_blendv_epi8(v_bfp_out0, v_u, v_r0_mask);
- const __m128i v_v_b = _mm_mullo_epi32(v_w0, v_r0_mult);
- const __m128i v_v_c = _mm_add_epi32(v_v_a, v_v_b);
- const __m128i v_bfp_out1 =
- LoadUnaligned16(&box_filter_process_output[1][x]);
- // Select u or box_filter_process_output[1][x].
- const __m128i v_r1_mult = _mm_blendv_epi8(v_bfp_out1, v_u, v_r1_mask);
- const __m128i v_v_d = _mm_mullo_epi32(v_w2, v_r1_mult);
- const __m128i v_v_e = _mm_add_epi32(v_v_c, v_v_d);
- __m128i v_s = RightShiftWithRounding_S32(
- v_v_e, kSgrProjRestoreBits + kSgrProjPrecisionBits);
- v_s = _mm_packs_epi32(v_s, v_s);
- v_s = _mm_packus_epi16(v_s, v_s);
- Store4(&dst[x], v_s);
- x += 4;
- } while (x < width);
-
- src += source_stride;
- dst += dest_stride;
- box_filter_process_output[0] += array_stride;
- box_filter_process_output[1] += array_stride;
- } while (++y < height);
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(src, source_stride, restoration_info, width, height,
+ kSgrScaleParameter[index][0], buffer->sgf_buffer, dst,
+ dest_stride);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(src, source_stride, restoration_info, width, height,
+ kSgrScaleParameter[index][1], buffer->sgf_buffer, dst,
+ dest_stride);
+ } else {
+ BoxFilterProcess(src, source_stride, restoration_info, width, height,
+ kSgrScaleParameter[index], buffer->sgf_buffer, dst,
+ dest_stride);
+ }
}
void Init8bpp() {
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc
new file mode 100644
index 00000000000..13f0853b2cb
--- /dev/null
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.cc
@@ -0,0 +1,397 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i LoadDivision(const __m128i division_table,
+ const __m128i reference_offset) {
+ const __m128i kOne = _mm_set1_epi16(0x0100);
+ const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+ const __m128i tt = _mm_unpacklo_epi8(t, t);
+ const __m128i idx = _mm_add_epi8(tt, kOne);
+ return _mm_shuffle_epi8(division_table, idx);
+}
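// For reference: the shuffle above is an 8-lane 16-bit table lookup. For each
// byte |ref| in |reference_offset| it gathers both bytes of entry |ref| from
// the eight int16_t division values, i.e. the scalar equivalent is simply:
#include <cstdint>
inline int16_t LoadDivisionScalar(const int16_t division_table16[8], int ref) {
  return division_table16[ref];
}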
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+ // Add the sign (0 or -1) to round towards zero.
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
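// For reference, the scalar form of the projection above; adding the sign bit
// before the rounding constant biases the rounding towards zero, matching the
// comment in the code. Sketch only.
inline int MvProjectionScalar(int mv, int denominator, int numerator) {
  const int m = mv * denominator * numerator;
  const int sign = (m < 0) ? -1 : 0;     // 0 for non-negative, -1 for negative
  return (m + sign + (1 << 13)) >> 14;   // round to 14 fractional bits
}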
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+ const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+ const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+ const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+ const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+ const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+ const __m128i projection = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
+ const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+ const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+ const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+ const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+ const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+ return _mm_sub_epi16(offset1, dst_sign);
+}
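// For reference, the scalar form of Project_SSE4_1: adding 63 to negative
// deltas makes the arithmetic shift by 6 round towards zero, and the
// xor/subtract pair conditionally negates the result when |dst_sign| is -1
// (all ones). Sketch only.
inline int ProjectScalar(int delta, int dst_sign /* 0 or -1 */) {
  const int adjusted = delta + ((delta < 0) ? 63 : 0);
  const int offset = adjusted >> 6;       // delta / 64, rounded towards zero
  return (offset ^ dst_sign) - dst_sign;  // negate iff dst_sign == -1
}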
+
+inline void GetPosition(
+ const __m128i division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const __m128i r_offsets, const __m128i source_reference_type8,
+ const __m128i skip_r, const __m128i y8_floor8, const __m128i y8_ceiling8,
+ const __m128i d_sign, const int delta, __m128i* const r,
+ __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+ const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+ __m128i projection_mv[2];
+ mvs[0] = LoadUnaligned16(mv_int + 0);
+ mvs[1] = LoadUnaligned16(mv_int + 4);
+ // Deinterleave the x and y components.
+ const __m128i kShuffle =
+ _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+ const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+ const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+ const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+ const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+ const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+ const __m128i positions = _mm_packs_epi16(position_x, position_y);
+ const __m128i k01234567 =
+ _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ *position_xy = _mm_add_epi8(positions, k01234567);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling =
+ std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+ 1; // [-1, 15]
+ const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+ const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+ const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+ const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+ const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+ const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+ const __m128i out = _mm_or_si128(underflow, overflow);
+ const __m128i skip_low = _mm_or_si128(skip_r, out);
+ const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+ StoreLo8(skip_64, skip);
+}
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+ const __m128i mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset =
+ static_cast<int16_t>(_mm_extract_epi16(position, idx));
+ if ((idx & 3) == 0) {
+ dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+ } else {
+ dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+ }
+ dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+ const __m128i reference_offset, const __m128i mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+ const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign, const int dst_sign,
+ const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be offset by kProjectionMvMaxHorizontalOffset,
+ // since coordinates in that range can become |position_x8| after projection.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const __m128i d_sign = _mm_set1_epi16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+ // The final position calculation is represented with int16_t. Valid
+ // position_y8 from its base is at most 7. After considering the horizontal
+ // offset, which is at most |stride - 1|, we have the following assertion,
+ // which means this optimization works for frame widths up to 32K (each
+ // position is an 8x8 block).
+ assert(8 * stride <= 32768);
+ const __m128i skip_reference = LoadLo8(skip_references);
+ const __m128i r_offsets = LoadLo8(reference_offsets);
+ const __m128i division_table = LoadUnaligned16(projection_divisions);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7]
+ const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
+ const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
+ mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+ // The following leftover processing cannot be moved out of the do...while
+ // loop. Doing so may change the order in which results are stored to the
+ // same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_xy, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const __m128i p_y =
+ _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset =
+ _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
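
The leftover handling above backs the pointer up so that the final vector load stays within bounds, re-processing a few already-handled elements, and falls back to scalar code for very short tails. A generic editorial sketch of that pattern (ProcessVector and ProcessScalar are hypothetical helpers, not part of this change):

// Process |n| elements 8 at a time. When 4-7 elements remain and the row has
// at least 8 elements, redo the last full vector instead of reading past the
// buffer; otherwise finish with scalar code.
void ProcessRow(int n, int16_t* data) {
  int x = 0;
  for (; x + 8 <= n; x += 8) ProcessVector(data + x);  // hypothetical helper
  const int leftover = n - x;
  if (leftover >= 4 && n >= 8) {
    ProcessVector(data + n - 8);  // overlapping tail, stays in bounds
  } else {
    for (; x < n; ++x) ProcessScalar(data + x);  // hypothetical helper
  }
}
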
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_SSE4_1
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h
new file mode 100644
index 00000000000..7828de5ca39
--- /dev/null
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_field_projection_sse4.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644
index 00000000000..a4b77da7877
--- /dev/null
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.cc
@@ -0,0 +1,262 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
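
The values above match the 16-bit kProjectionMvDivisionLookup table, widened to 32 bits for use with _mm_insert_epi32(). As an editorial illustration (the source hard-codes the values), each entry appears to be the truncated division:

// Hypothetical generator for the table above: floor((1 << 14) / d), with a 0
// placeholder at index 0.
constexpr int ProjectionMvDivision(int d) {
  return (d == 0) ? 0 : (1 << 14) / d;
}
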
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const __m128i numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
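
For reference, an editorial sketch of the per-component scalar computation that the vector code above parallels; the product is shifted right by 14 with symmetric rounding (towards zero at the halfway point). Clamping to +/-kProjectionMvClamp happens afterwards in MvProjectionClip().

// Scalar equivalent of MvProjection() for one motion vector component.
inline int MvProjectionScalar(int mv, int division, int reference_offset) {
  const int64_t product =
      static_cast<int64_t>(mv) * division * reference_offset;
  const int64_t sign = (product < 0) ? -1 : 0;  // matches _mm_srai_epi32(m, 31)
  return static_cast<int>((product + sign + (1 << 13)) >> 14);
}
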
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+ const __m128i denominators[2],
+ const __m128i numerator) {
+ const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+ const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+ const __m128i mv = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i MvProjectionCompoundClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t temporal_reference_offsets[2],
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadLo8(tmvs);
+ const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+ mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
+ denominators[0] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+ denominators[1] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+ const __m128i offsets = LoadLo8(reference_offsets);
+ const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline __m128i MvProjectionSingleClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offset) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadAligned16(tmvs);
+ __m128i lookup = _mm_cvtsi32_si128(
+ kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+ 1);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+ 2);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+ 3);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+ mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+ denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+ denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+ const __m128i numerator = _mm_set1_epi32(reference_offset);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+ const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+ StoreAligned16(candidate_mvs, d);
+}
+
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+ const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+ const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+ StoreAligned16(candidate_mvs, mv3);
+}
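
An editorial sketch of the scalar equivalents of the two roundings above, assuming 1/8-pel motion vector units: LowPrecision() drops the lowest bit (1/8-pel to 1/4-pel) rounding the magnitude towards zero, and ForceInteger() rounds to the nearest full pel (multiple of 8), ties towards zero.

inline int16_t LowPrecisionScalar(int16_t mv) {
  // (mv - sign) & ~1, with sign = -1 for negative values.
  return static_cast<int16_t>(((mv < 0) ? mv + 1 : mv) & ~1);
}
inline int16_t ForceIntegerScalar(int16_t mv) {
  // (mv + 3 - sign) & ~7, with sign = -1 for negative values.
  return static_cast<int16_t>(((mv < 0) ? mv + 4 : mv + 3) & ~7);
}
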
+
+void MvProjectionCompoundLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped.
+  // Make a local copy of |reference_offsets| to help the compiler.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped.
+  // Make a local copy of |reference_offsets| to help the compiler.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped.
+  // Make a local copy of |reference_offsets| to help the compiler.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 4;
+ } while (i < count);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_SSE4_1
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h
new file mode 100644
index 00000000000..b8b04123635
--- /dev/null
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/motion_vector_search_sse4.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h
index 2a10dc05633..cd61c9275d3 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/transpose_sse4.h
@@ -27,7 +27,7 @@ namespace libgav1 {
namespace dsp {
LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
- // Unpack 16 bit elements. Goes from:
+ // Unpack 8 bit elements. Goes from:
// in[0]: 00 01 02 03
// in[1]: 10 11 12 13
// in[2]: 20 21 22 23
@@ -43,10 +43,10 @@ LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
return _mm_unpacklo_epi16(a0, a1);
}
-LIBGAV1_ALWAYS_INLINE void Transpose8x8_U8(const __m128i* const in,
- __m128i* out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 04 05 06 07
+LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in,
+ __m128i* out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
// in[1]: 10 11 12 13 14 15 16 17
// in[2]: 20 21 22 23 24 25 26 27
// in[3]: 30 31 32 33 34 35 36 37
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc b/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc
index 4003f5db459..922110ba573 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/warp_sse4.cc
@@ -19,11 +19,10 @@
#include <smmintrin.h>
-#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstdlib>
+#include <cstring>
#include <type_traits>
#include "src/dsp/constants.h"
@@ -69,7 +68,7 @@ inline void HorizontalFilter(const int sx4, const int16_t alpha,
f = LoadLo8(kWarpedFilters8[offset]);
sx += alpha;
}
- Transpose8x8_U8(filter, filter);
+ Transpose8x8To4x16_U8(filter, filter);
// |filter| now contains two filters per register.
// Staggered combinations allow us to take advantage of _mm_maddubs_epi16
// without overflowing the sign bit. The sign bit is hit only where two taps
@@ -128,10 +127,10 @@ inline void WriteVerticalFilter(const __m128i filter[8],
sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
if (is_compound) {
const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
- StoreUnaligned16(reinterpret_cast<int16_t*>(dst_row), sum);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
} else {
const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
- StoreLo8(reinterpret_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
}
}
@@ -159,22 +158,206 @@ inline void WriteVerticalFilter(const __m128i filter[8],
sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
if (is_compound) {
const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
- StoreUnaligned16(reinterpret_cast<int16_t*>(dst_row), sum);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
} else {
const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
- StoreLo8(reinterpret_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
}
}
-template <bool is_compound>
-void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
- int source_height, const int* warp_params, int subsampling_x,
- int subsampling_y, int block_start_x, int block_start_y,
- int block_width, int block_height, int16_t alpha, int16_t beta,
- int16_t gamma, int16_t delta, void* dest,
- ptrdiff_t dest_stride) {
- constexpr int kRoundBitsVertical =
- is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height, int ix4, int iy4,
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 1
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel = first_row_border[row * source_stride];
+
+ if (is_compound) {
+ const __m128i sum =
+ _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
+ kInterRoundBitsCompoundVertical));
+ StoreUnaligned16(dst_row, sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int y4, int ix4, int iy4, int gamma,
+ int delta, int16_t intermediate_result_column[15],
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Region 2 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
+ delta, dst_row, dest_stride);
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride,
+ int source_height, int alpha, int beta, int x4, int ix4,
+ int iy4, int16_t intermediate_result[15][8]) {
+ // Region 3
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha,
+ int beta, int x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
+ // Region 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ // Convert src_row_v to int8 (subtract 128).
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* warp_params, int subsampling_x,
+ int subsampling_y, int src_x, int src_y,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta, DestType* dst_row,
+ ptrdiff_t dest_stride) {
union {
// Intermediate_result is the output of the horizontal filtering and
// rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
@@ -187,242 +370,133 @@ void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
int16_t intermediate_result_column[15];
};
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+  // simplified. Furthermore, in region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+ // vertically. Unfortunately because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ // Outside the frame in both directions. One repeated value.
+ WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
+ source_height, ix4, iy4, dst_row,
+ dest_stride);
+ return;
+ }
+ // Outside the frame horizontally. Rows repeated.
+ WarpRegion2<is_compound, DestType>(
+ src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
+ intermediate_result_column, dst_row, dest_stride);
+ return;
+ }
+
+ if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ // Outside the frame vertically.
+ WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
+ beta, x4, ix4, iy4, intermediate_result);
+ } else {
+ // Inside the frame.
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
+ iy4, intermediate_result);
+ }
+ // Region 3 and 4 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
+ dst_row, dest_stride);
+}
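
The branches above select among the four regions of the diagram; a compact editorial restatement of that classification (hypothetical helper, mirroring the checks in HandleWarpBlock for an 8x8 block whose filters read columns [ix4 - 7, ix4 + 7] and rows [iy4 - 7, iy4 + 7]):

int ClassifyWarpRegion(int ix4, int iy4, int source_width, int source_height) {
  const bool outside_x = ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;
  const bool outside_y = iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;
  if (outside_x && outside_y) return 1;  // constant prediction block
  if (outside_x) return 2;               // each row reduces to one border pixel
  if (outside_y) return 3;               // all 15 input rows are identical
  return 4;                              // general case
}
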
+
+template <bool is_compound>
+void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
+ int source_height, const int* warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x, int block_start_y,
+ int block_width, int block_height, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta, void* dest,
+ ptrdiff_t dest_stride) {
const auto* const src = static_cast<const uint8_t*>(source);
using DestType =
typename std::conditional<is_compound, int16_t, uint8_t>::type;
auto* dst = static_cast<DestType*>(dest);
+ // Warp process applies for each 8x8 block.
assert(block_width >= 8);
assert(block_height >= 8);
-
- // Warp process applies for each 8x8 block (or smaller).
- int start_y = block_start_y;
+ const int block_end_x = block_start_x + block_width;
+ const int block_end_y = block_start_y + block_height;
+
+ const int start_x = block_start_x;
+ const int start_y = block_start_y;
+ int src_x = (start_x + 4) << subsampling_x;
+ int src_y = (start_y + 4) << subsampling_y;
+ const int end_x = (block_end_x + 4) << subsampling_x;
+ const int end_y = (block_end_y + 4) << subsampling_y;
do {
- int start_x = block_start_x;
+ DestType* dst_row = dst;
+ src_x = (start_x + 4) << subsampling_x;
do {
- const int src_x = (start_x + 4) << subsampling_x;
- const int src_y = (start_y + 4) << subsampling_y;
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
- // A prediction block may fall outside the frame's boundaries. If a
- // prediction block is calculated using only samples outside the frame's
- // boundary, the filtering can be simplified. We can divide the plane
- // into several regions and handle them differently.
- //
- // | |
- // 1 | 3 | 1
- // | |
- // -------+-----------+-------
- // |***********|
- // 2 |*****4*****| 2
- // |***********|
- // -------+-----------+-------
- // | |
- // 1 | 3 | 1
- // | |
- //
- // At the center, region 4 represents the frame and is the general case.
- //
- // In regions 1 and 2, the prediction block is outside the frame's
- // boundary horizontally. Therefore the horizontal filtering can be
- // simplified. Furthermore, in the region 1 (at the four corners), the
- // prediction is outside the frame's boundary both horizontally and
- // vertically, so we get a constant prediction block.
- //
- // In region 3, the prediction block is outside the frame's boundary
- // vertically. Unfortunately because we apply the horizontal filters
- // first, by the time we apply the vertical filters, they no longer see
- // simple inputs. So the only simplification is that all the rows are
- // the same, but we still need to apply all the horizontal and vertical
- // filters.
-
- // Check for two simple special cases, where the horizontal filter can
- // be significantly simplified.
- //
- // In general, for each row, the horizontal filter is calculated as
- // follows:
- // for (int x = -4; x < 4; ++x) {
- // const int offset = ...;
- // int sum = first_pass_offset;
- // for (int k = 0; k < 8; ++k) {
- // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
- // sum += kWarpedFilters[offset][k] * src_row[column];
- // }
- // ...
- // }
- // The column index before clipping, ix4 + x + k - 3, varies in the range
- // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
- // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
- // border index (source_width - 1 or 0, respectively). Then for each x,
- // the inner for loop of the horizontal filter is reduced to multiplying
- // the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
- // Regions 1 and 2.
- // Points to the left or right border of the first row of |src|.
- const uint8_t* first_row_border =
- (ix4 + 7 <= 0) ? src : src + source_width - 1;
- // In general, for y in [-7, 8), the row number iy4 + y is clipped:
- // const int row = Clip3(iy4 + y, 0, source_height - 1);
- // In two special cases, iy4 + y is clipped to either 0 or
- // source_height - 1 for all y. In the rest of the cases, iy4 + y is
- // bounded and we can avoid clipping iy4 + y by relying on a reference
- // frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
- // Region 1.
- // Every sample used to calculate the prediction block has the same
- // value. So the whole prediction block has the same value.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
- const uint8_t row_border_pixel =
- first_row_border[row * source_stride];
-
- DestType* dst_row = dst + start_x - block_start_x;
- if (is_compound) {
- const __m128i sum =
- _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
- kRoundBitsVertical));
- StoreUnaligned16(dst_row, sum);
- } else {
- memset(dst_row, row_border_pixel, 8);
- }
- const DestType* const first_dst_row = dst_row;
- dst_row += dest_stride;
- for (int y = 1; y < 8; ++y) {
- memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
- dst_row += dest_stride;
- }
- // End of region 1. Continue the |start_x| do-while loop.
- start_x += 8;
- continue;
- }
-
- // Region 2.
- // Horizontal filter.
- // The input values in this region are generated by extending the border
- // which makes them identical in the horizontal direction. This
- // computation could be inlined in the vertical pass but most
- // implementations will need a transpose of some sort.
- // It is not necessary to use the offset values here because the
- // horizontal pass is a simple shift and the vertical pass will always
- // require using 32 bits.
- for (int y = -7; y < 8; ++y) {
- // We may over-read up to 13 pixels above the top source row, or up
- // to 13 pixels below the bottom source row. This is proved in
- // warp.cc.
- const int row = iy4 + y;
- int sum = first_row_border[row * source_stride];
- sum <<= (kFilterBits - kInterRoundBitsHorizontal);
- intermediate_result_column[y + 7] = sum;
- }
- // Vertical filter.
- DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
- for (int y = 0; y < 8; ++y) {
- int sy = sy4 - MultiplyBy4(gamma);
- __m128i filter[8];
- for (__m128i& f : filter) {
- const int offset =
- RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
- kWarpedPixelPrecisionShifts;
- f = LoadUnaligned16(kWarpedFilters[offset]);
- sy += gamma;
- }
- Transpose8x8_U16(filter, filter);
- WriteVerticalFilter<is_compound>(
- filter, &intermediate_result_column[y], dst_row);
- dst_row += dest_stride;
- sy4 += delta;
- }
- // End of region 2. Continue the |start_x| do-while loop.
- start_x += 8;
- continue;
- }
-
- // Regions 3 and 4.
- // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
-
- // In general, for y in [-7, 8), the row number iy4 + y is clipped:
- // const int row = Clip3(iy4 + y, 0, source_height - 1);
- // In two special cases, iy4 + y is clipped to either 0 or
- // source_height - 1 for all y. In the rest of the cases, iy4 + y is
- // bounded and we can avoid clipping iy4 + y by relying on a reference
- // frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
- // Region 3.
- // Horizontal filter.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
- const uint8_t* const src_row = src + row * source_stride;
- // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
- // read but is ignored.
- //
- // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
- // bytes after src_row[source_width - 1]. We assume the source frame
- // has left and right borders of at least 13 bytes that extend the
- // frame boundary pixels. We also assume there is at least one extra
- // padding byte after the right border of the last source row.
- const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
- for (int y = -7; y < 8; ++y) {
- HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
- sx4 += beta;
- }
- } else {
- // Region 4.
- // Horizontal filter.
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
- for (int y = -7; y < 8; ++y) {
- // We may over-read up to 13 pixels above the top source row, or up
- // to 13 pixels below the bottom source row. This is proved in
- // warp.cc.
- const int row = iy4 + y;
- const uint8_t* const src_row = src + row * source_stride;
- // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
- // read but is ignored.
- //
- // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
- // bytes after src_row[source_width - 1]. We assume the source frame
- // has left and right borders of at least 13 bytes that extend the
- // frame boundary pixels. We also assume there is at least one extra
- // padding byte after the right border of the last source row.
- const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
- // Convert src_row_v to int8 (subtract 128).
- HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
- sx4 += beta;
- }
- }
-
- // Regions 3 and 4.
- // Vertical filter.
- DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
- for (int y = 0; y < 8; ++y) {
- int sy = sy4 - MultiplyBy4(gamma);
- __m128i filter[8];
- for (__m128i& f : filter) {
- const int offset =
- RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
- kWarpedPixelPrecisionShifts;
- f = LoadUnaligned16(kWarpedFilters[offset]);
- sy += gamma;
- }
- Transpose8x8_U16(filter, filter);
- WriteVerticalFilter<is_compound>(filter, intermediate_result, y,
- dst_row);
- dst_row += dest_stride;
- sy4 += delta;
- }
- start_x += 8;
- } while (start_x < block_start_x + block_width);
+ HandleWarpBlock<is_compound, DestType>(
+ src, source_stride, source_width, source_height, warp_params,
+ subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
+ dst_row, dest_stride);
+ src_x += (8 << subsampling_x);
+ dst_row += 8;
+ } while (src_x < end_x);
dst += 8 * dest_stride;
- start_y += 8;
- } while (start_y < block_start_y + block_height);
+ src_y += (8 << subsampling_y);
+ } while (src_y < end_y);
}
void Init8bpp() {
diff --git a/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h b/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h
index 42309916eb0..841dd5a26af 100644
--- a/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h
+++ b/chromium/third_party/libgav1/src/src/dsp/x86/weight_mask_sse4.h
@@ -36,6 +36,7 @@ void WeightMaskInit_SSE4_1();
#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
diff --git a/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h b/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h
index 6b336b0a58c..1d6a1f4fadb 100644
--- a/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h
+++ b/chromium/third_party/libgav1/src/src/frame_scratch_buffer.h
@@ -17,17 +17,19 @@
#ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
#define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+#include <condition_variable> // NOLINT (unapproved c++11 header)
#include <cstdint>
#include <memory>
#include <mutex> // NOLINT (unapproved c++11 header)
-#include "src/loop_filter_mask.h"
#include "src/loop_restoration_info.h"
#include "src/residual_buffer_pool.h"
#include "src/symbol_decoder_context.h"
#include "src/threading_strategy.h"
#include "src/tile_scratch_buffer.h"
#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
#include "src/utils/dynamic_buffer.h"
#include "src/utils/memory.h"
@@ -37,17 +39,21 @@
namespace libgav1 {
+// Buffer used to store the unfiltered pixels that are necessary for decoding
+// the next superblock row (for the intra prediction process).
+using IntraPredictionBuffer =
+ std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>;
+
// Buffer to facilitate decoding a frame. This struct is used only within
// DecoderImpl::DecodeTiles().
struct FrameScratchBuffer {
- LoopFilterMask loop_filter_mask;
LoopRestorationInfo loop_restoration_info;
Array2D<int16_t> cdef_index;
Array2D<TransformSize> inter_transform_sizes;
+ BlockParametersHolder block_parameters_holder;
TemporalMotionField motion_field;
SymbolDecoderContext symbol_decoder_context;
std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
- Array2D<SuperBlockState> superblock_state;
// threaded_window_buffer will be subdivided by PostFilter into windows of
// width 512 pixels. Each row in the window is filtered by a worker thread.
// To avoid false sharing, each 512-pixel row processed by one thread should
@@ -62,11 +68,22 @@ struct FrameScratchBuffer {
// for every 32x32 for chroma with subsampling). The indices of the rows that
// are stored are specified in |kDeblockedRowsForLoopRestoration|.
YuvBuffer deblock_buffer;
+ // The size of this dynamic buffer is |tile_rows|.
+ DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
TileScratchBufferPool tile_scratch_buffer_pool;
- // TODO(vigneshv): This is part of the frame scratch buffer for now. This will
- // have to change or move to DecoderImpl when frame parallel mode with
- // in-frame multi-theading is implemented.
ThreadingStrategy threading_strategy;
+ std::mutex superblock_row_mutex;
+ // The size of this buffer is the number of superblock rows.
+ // |superblock_row_progress[i]| is incremented whenever a tile finishes
+ // decoding superblock row at index i. If the count reaches tile_columns, then
+ // |superblock_row_progress_condvar[i]| is notified.
+ DynamicBuffer<int> superblock_row_progress
+ LIBGAV1_GUARDED_BY(superblock_row_mutex);
+ // The size of this buffer is the number of superblock rows. Used to wait for
+ // |superblock_row_progress[i]| to reach tile_columns.
+ DynamicBuffer<std::condition_variable> superblock_row_progress_condvar;
+ // Used to signal tile decoding failure in the combined multithreading mode.
+ bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex);
};
class FrameScratchBufferPool {
@@ -89,8 +106,6 @@ class FrameScratchBufferPool {
private:
std::mutex mutex_;
- // TODO(b/142583029): The size of this stack is set to kMaxThreads. This may
- // have to be revisited as we iterate over the frame parallel design.
Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_
LIBGAV1_GUARDED_BY(mutex_);
};
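
The new members above describe a per-superblock-row progress counter plus condition variable. A minimal editorial sketch of that signalling pattern (hypothetical helpers with simplified parameters, not part of this change); |progress| and |condvars| have one entry per superblock row and are guarded by |mutex|:

// A tile worker reports that it finished decoding superblock row |row|.
void ReportSuperblockRowDone(std::mutex& mutex,
                             std::condition_variable* condvars, int* progress,
                             int row, int tile_columns) {
  std::lock_guard<std::mutex> lock(mutex);
  if (++progress[row] == tile_columns) condvars[row].notify_all();
}

// A consumer waits until every tile column has finished |row| or a tile
// decoding failure has been flagged.
void WaitForSuperblockRow(std::mutex& mutex, std::condition_variable* condvars,
                          const int* progress, const bool& failed, int row,
                          int tile_columns) {
  std::unique_lock<std::mutex> lock(mutex);
  condvars[row].wait(
      lock, [&] { return progress[row] == tile_columns || failed; });
}
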
diff --git a/chromium/third_party/libgav1/src/src/gav1/decoder.h b/chromium/third_party/libgav1/src/src/gav1/decoder.h
index 5151d647b6f..9d0d87291ee 100644
--- a/chromium/third_party/libgav1/src/src/gav1/decoder.h
+++ b/chromium/third_party/libgav1/src/src/gav1/decoder.h
@@ -94,11 +94,11 @@ class LIBGAV1_PUBLIC Decoder {
// NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
// successful |EnqueueFrame()| call, the caller must keep the |data| buffer
// alive until:
- // 1) If release_input_buffer is not nullptr, then |data| buffer must be kept
- // alive until release_input_buffer is called with the |buffer_private_data|
- // passed into this EnqueueFrame call.
- // 2) If release_input_buffer is nullptr, then |data| buffer must be kept
- // alive until the corresponding DequeueFrame() call is completed.
+ // 1) If |settings_.release_input_buffer| is not nullptr, then |data| buffer
+ // must be kept alive until release_input_buffer is called with the
+ // |buffer_private_data| passed into this EnqueueFrame call.
+ // 2) If |settings_.release_input_buffer| is nullptr, then |data| buffer must
+ // be kept alive until the corresponding DequeueFrame() call is completed.
StatusCode EnqueueFrame(const uint8_t* data, size_t size,
int64_t user_private_data, void* buffer_private_data);
@@ -107,9 +107,12 @@ class LIBGAV1_PUBLIC Decoder {
// compressed frame. If there are no displayable frames available, sets
// |*out_ptr| to nullptr. Returns an error status if there is an error.
//
- // In frame parallel mode, if |settings_.blocking_dequeue| is true, then this
- // call will block until an enqueued frame has been decoded. Otherwise, it
- // will return kStatusTryAgain if an enqueued frame is not yet decoded.
+ // If |settings_.blocking_dequeue| is false and the decoder is operating in
+ // frame parallel mode (|settings_.frame_parallel| is true and the video
+ // stream passes the decoder's heuristics for enabling frame parallel mode),
+ // then this call will return kStatusTryAgain if an enqueued frame is not yet
+  // decoded (it is a non-blocking call in this case). In all other cases, this
+ // call will block until an enqueued frame has been decoded.
StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
// Signals the end of stream.
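
A minimal usage sketch of the contract documented above (editorial illustration, assuming the include path "gav1/decoder.h" and a decoder already initialized elsewhere; no release_input_buffer callback is installed, so |data| must stay alive until the matching DequeueFrame() completes):

#include "gav1/decoder.h"

bool DecodeOneFrame(libgav1::Decoder& decoder, const uint8_t* data,
                    size_t size) {
  if (decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
                           /*buffer_private_data=*/nullptr) !=
      libgav1::kStatusOk) {
    return false;
  }
  const libgav1::DecoderBuffer* buffer = nullptr;
  // May return kStatusTryAgain in non-blocking frame parallel mode.
  return decoder.DequeueFrame(&buffer) == libgav1::kStatusOk &&
         buffer != nullptr;
}
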
diff --git a/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h b/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h
index d7ec8d6754b..33777248a3c 100644
--- a/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h
+++ b/chromium/third_party/libgav1/src/src/gav1/decoder_settings.h
@@ -41,15 +41,13 @@ typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data,
void* buffer_private_data);
typedef struct Libgav1DecoderSettings {
- // Number of threads to use when decoding. Must be greater than 0. The
- // library will create at most |threads|-1 new threads, the calling thread is
- // considered part of the library's thread count. Defaults to 1 (no new
- // threads will be created).
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
int threads;
- // A boolean. Do frame parallel decoding.
- //
- // NOTE: Frame parallel decoding is not implemented, this setting is
- // currently ignored.
+ // A boolean. Indicate to the decoder that frame parallel decoding is allowed.
+ // Note that this is just a request and the decoder will decide the number of
+ // frames to be decoded in parallel based on the video stream being decoded.
int frame_parallel;
// A boolean. In frame parallel mode, should Libgav1DecoderDequeueFrame wait
  // until an enqueued frame is available for dequeueing.
@@ -91,15 +89,13 @@ using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback;
// Applications must populate this structure before creating a decoder instance.
struct DecoderSettings {
- // Number of threads to use when decoding. Must be greater than 0. The
- // library will create at most |threads|-1 new threads, the calling thread is
- // considered part of the library's thread count. Defaults to 1 (no new
- // threads will be created).
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
int threads = 1;
- // Do frame parallel decoding.
- //
- // NOTE: Frame parallel decoding is not implemented, this setting is
- // currently ignored.
+ // Indicate to the decoder that frame parallel decoding is allowed. Note that
+ // this is just a request and the decoder will decide the number of frames to
+ // be decoded in parallel based on the video stream being decoded.
bool frame_parallel = false;
  // In frame parallel mode, should DequeueFrame wait until an enqueued frame is
// available for dequeueing.
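
A short editorial sketch of the settings documented above (field names as declared in this header; the values chosen are illustrative only):

libgav1::DecoderSettings settings;
settings.threads = 4;              // the library may create up to 4 new threads
settings.frame_parallel = true;    // a request; the decoder may still decode serially
settings.blocking_dequeue = true;  // DequeueFrame() waits for a decoded frame
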
diff --git a/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake b/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake
index a97f1425dd3..b97d09def17 100644
--- a/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake
+++ b/chromium/third_party/libgav1/src/src/libgav1_decoder.cmake
@@ -33,8 +33,6 @@ list(APPEND libgav1_decoder_sources
"${libgav1_source}/inter_intra_masks.inc"
"${libgav1_source}/internal_frame_buffer_list.cc"
"${libgav1_source}/internal_frame_buffer_list.h"
- "${libgav1_source}/loop_filter_mask.cc"
- "${libgav1_source}/loop_filter_mask.h"
"${libgav1_source}/loop_restoration_info.cc"
"${libgav1_source}/loop_restoration_info.h"
"${libgav1_source}/motion_vector.cc"
@@ -43,6 +41,7 @@ list(APPEND libgav1_decoder_sources
"${libgav1_source}/obu_parser.h"
"${libgav1_source}/post_filter/cdef.cc"
"${libgav1_source}/post_filter/deblock.cc"
+ "${libgav1_source}/post_filter/deblock_thresholds.inc"
"${libgav1_source}/post_filter/loop_restoration.cc"
"${libgav1_source}/post_filter/post_filter.cc"
"${libgav1_source}/post_filter/super_res.cc"
@@ -56,6 +55,7 @@ list(APPEND libgav1_decoder_sources
"${libgav1_source}/reconstruction.h"
"${libgav1_source}/residual_buffer_pool.cc"
"${libgav1_source}/residual_buffer_pool.h"
+ "${libgav1_source}/scan_tables.inc"
"${libgav1_source}/symbol_decoder_context.cc"
"${libgav1_source}/symbol_decoder_context.h"
"${libgav1_source}/symbol_decoder_context_cdfs.inc"
diff --git a/chromium/third_party/libgav1/src/src/loop_filter_mask.cc b/chromium/third_party/libgav1/src/src/loop_filter_mask.cc
deleted file mode 100644
index 8f96df9bf92..00000000000
--- a/chromium/third_party/libgav1/src/src/loop_filter_mask.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright 2019 The libgav1 Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "src/loop_filter_mask.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <new>
-
-#include "src/utils/array_2d.h"
-#include "src/utils/compiler_attributes.h"
-
-namespace libgav1 {
-
-#if !LIBGAV1_CXX17
-// static.
-constexpr BitMaskSet LoopFilterMask::kPredictionModeDeltasMask;
-#endif
-
-bool LoopFilterMask::Reset(int width, int height) {
- num_64x64_blocks_per_row_ = DivideBy64(width + 63);
- num_64x64_blocks_per_column_ = DivideBy64(height + 63);
- const int num_64x64_blocks =
- num_64x64_blocks_per_row_ * num_64x64_blocks_per_column_;
- if (num_64x64_blocks_ == -1 || num_64x64_blocks_ < num_64x64_blocks) {
- // Note that this need not be zero initialized here since we zero
- // initialize the required number of entries in the loop that follows.
- loop_filter_masks_.reset(new (std::nothrow)
- Data[num_64x64_blocks]); // NOLINT.
- if (loop_filter_masks_ == nullptr) {
- return false;
- }
- }
- for (int i = 0; i < num_64x64_blocks; ++i) {
- memset(&loop_filter_masks_[i], 0, sizeof(loop_filter_masks_[i]));
- }
- num_64x64_blocks_ = num_64x64_blocks;
- return true;
-}
-
-void LoopFilterMask::Build(
- const ObuSequenceHeader& sequence_header,
- const ObuFrameHeader& frame_header, int tile_group_start,
- int tile_group_end, const BlockParametersHolder& block_parameters_holder,
- const Array2D<TransformSize>& inter_transform_sizes) {
- for (int tile_number = tile_group_start; tile_number <= tile_group_end;
- ++tile_number) {
- const int row = tile_number / frame_header.tile_info.tile_columns;
- const int column = tile_number % frame_header.tile_info.tile_columns;
- const int row4x4_start = frame_header.tile_info.tile_row_start[row];
- const int row4x4_end = frame_header.tile_info.tile_row_start[row + 1];
- const int column4x4_start =
- frame_header.tile_info.tile_column_start[column];
- const int column4x4_end =
- frame_header.tile_info.tile_column_start[column + 1];
-
- const int num_planes = sequence_header.color_config.is_monochrome
- ? kMaxPlanesMonochrome
- : kMaxPlanes;
- for (int plane = kPlaneY; plane < num_planes; ++plane) {
- // For U and V planes, do not build bit masks if level == 0.
- if (plane > kPlaneY && frame_header.loop_filter.level[plane + 1] == 0) {
- continue;
- }
- const int8_t subsampling_x =
- (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
- const int8_t subsampling_y =
- (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_y;
- const int vertical_step = 1 << subsampling_y;
- const int horizontal_step = 1 << subsampling_x;
-
- // Build bit masks for vertical edges (except the frame boundary).
- if (column4x4_start != 0) {
- const int plane_height =
- RightShiftWithRounding(frame_header.height, subsampling_y);
- const int row4x4_limit =
- std::min(row4x4_end, DivideBy4(plane_height + 3) << subsampling_y);
- const int vertical_level_index =
- kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical];
- for (int row4x4 = GetDeblockPosition(row4x4_start, subsampling_y);
- row4x4 < row4x4_limit; row4x4 += vertical_step) {
- const int column4x4 =
- GetDeblockPosition(column4x4_start, subsampling_x);
- const BlockParameters& bp =
- *block_parameters_holder.Find(row4x4, column4x4);
- const uint8_t vertical_level =
- bp.deblock_filter_level[vertical_level_index];
- const BlockParameters& bp_left = *block_parameters_holder.Find(
- row4x4, column4x4 - horizontal_step);
- const uint8_t left_level =
- bp_left.deblock_filter_level[vertical_level_index];
- const int unit_id = DivideBy16(row4x4) * num_64x64_blocks_per_row_ +
- DivideBy16(column4x4);
- const int row = row4x4 % kNum4x4InLoopFilterMaskUnit;
- const int column = column4x4 % kNum4x4InLoopFilterMaskUnit;
- const int shift = LoopFilterMask::GetShift(row, column);
- const int index = LoopFilterMask::GetIndex(row);
- const auto mask = static_cast<uint64_t>(1) << shift;
- // Tile boundary must be coding block boundary. So we don't have to
- // check (!left_skip || !skip || is_vertical_border).
- if (vertical_level != 0 || left_level != 0) {
- assert(inter_transform_sizes[row4x4] != nullptr);
- const TransformSize tx_size =
- (plane == kPlaneY) ? inter_transform_sizes[row4x4][column4x4]
- : bp.uv_transform_size;
- const TransformSize left_tx_size =
- (plane == kPlaneY)
- ? inter_transform_sizes[row4x4][column4x4 - horizontal_step]
- : bp_left.uv_transform_size;
- const LoopFilterTransformSizeId transform_size_id =
- GetTransformSizeIdWidth(tx_size, left_tx_size);
- SetLeft(mask, unit_id, plane, transform_size_id, index);
- const uint8_t current_level =
- (vertical_level == 0) ? left_level : vertical_level;
- SetLevel(current_level, unit_id, plane, kLoopFilterTypeVertical,
- LoopFilterMask::GetLevelOffset(row, column));
- }
- }
- }
-
- // Build bit masks for horizontal edges (except the frame boundary).
- if (row4x4_start != 0) {
- const int plane_width =
- RightShiftWithRounding(frame_header.width, subsampling_x);
- const int column4x4_limit = std::min(
- column4x4_end, DivideBy4(plane_width + 3) << subsampling_y);
- const int horizontal_level_index =
- kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal];
- for (int column4x4 = GetDeblockPosition(column4x4_start, subsampling_x);
- column4x4 < column4x4_limit; column4x4 += horizontal_step) {
- const int row4x4 = GetDeblockPosition(row4x4_start, subsampling_y);
- const BlockParameters& bp =
- *block_parameters_holder.Find(row4x4, column4x4);
- const uint8_t horizontal_level =
- bp.deblock_filter_level[horizontal_level_index];
- const BlockParameters& bp_top =
- *block_parameters_holder.Find(row4x4 - vertical_step, column4x4);
- const uint8_t top_level =
- bp_top.deblock_filter_level[horizontal_level_index];
- const int unit_id = DivideBy16(row4x4) * num_64x64_blocks_per_row_ +
- DivideBy16(column4x4);
- const int row = row4x4 % kNum4x4InLoopFilterMaskUnit;
- const int column = column4x4 % kNum4x4InLoopFilterMaskUnit;
- const int shift = LoopFilterMask::GetShift(row, column);
- const int index = LoopFilterMask::GetIndex(row);
- const auto mask = static_cast<uint64_t>(1) << shift;
- // Tile boundary must be coding block boundary. So we don't have to
- // check (!top_skip || !skip || is_horizontal_border).
- if (horizontal_level != 0 || top_level != 0) {
- assert(inter_transform_sizes[row4x4] != nullptr);
- const TransformSize tx_size =
- (plane == kPlaneY) ? inter_transform_sizes[row4x4][column4x4]
- : bp.uv_transform_size;
- const TransformSize top_tx_size =
- (plane == kPlaneY)
- ? inter_transform_sizes[row4x4 - vertical_step][column4x4]
- : bp_top.uv_transform_size;
- const LoopFilterTransformSizeId transform_size_id =
- static_cast<LoopFilterTransformSizeId>(
- std::min({kTransformHeightLog2[tx_size] - 2,
- kTransformHeightLog2[top_tx_size] - 2, 2}));
- SetTop(mask, unit_id, plane, transform_size_id, index);
- const uint8_t current_level =
- (horizontal_level == 0) ? top_level : horizontal_level;
- SetLevel(current_level, unit_id, plane, kLoopFilterTypeHorizontal,
- LoopFilterMask::GetLevelOffset(row, column));
- }
- }
- }
- }
- }
- assert(IsValid());
-}
-
-bool LoopFilterMask::IsValid() const {
- for (int mask_id = 0; mask_id < num_64x64_blocks_; ++mask_id) {
- for (int plane = 0; plane < kMaxPlanes; ++plane) {
- for (int i = 0; i < kNumLoopFilterTransformSizeIds; ++i) {
- for (int j = i + 1; j < kNumLoopFilterTransformSizeIds; ++j) {
- for (int k = 0; k < kNumLoopFilterMasks; ++k) {
- if ((loop_filter_masks_[mask_id].left[plane][i][k] &
- loop_filter_masks_[mask_id].left[plane][j][k]) != 0 ||
- (loop_filter_masks_[mask_id].top[plane][i][k] &
- loop_filter_masks_[mask_id].top[plane][j][k]) != 0) {
- return false;
- }
- }
- }
- }
- }
- }
- return true;
-}
-
-} // namespace libgav1
diff --git a/chromium/third_party/libgav1/src/src/loop_filter_mask.h b/chromium/third_party/libgav1/src/src/loop_filter_mask.h
deleted file mode 100644
index 314f020b99b..00000000000
--- a/chromium/third_party/libgav1/src/src/loop_filter_mask.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_LOOP_FILTER_MASK_H_
-#define LIBGAV1_SRC_LOOP_FILTER_MASK_H_
-
-#include <array>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-
-#include "src/dsp/constants.h"
-#include "src/dsp/dsp.h"
-#include "src/obu_parser.h"
-#include "src/utils/array_2d.h"
-#include "src/utils/bit_mask_set.h"
-#include "src/utils/block_parameters_holder.h"
-#include "src/utils/common.h"
-#include "src/utils/constants.h"
-#include "src/utils/segmentation.h"
-#include "src/utils/types.h"
-
-namespace libgav1 {
-
-class LoopFilterMask {
- public:
- // This structure holds loop filter bit masks for a 64x64 block.
- // 64x64 block contains kNum4x4In64x64 = (64x64 / (4x4) = 256)
- // 4x4 blocks. It requires kNumLoopFilterMasks = 4 uint64_t to represent them.
- struct Data : public Allocable {
- uint8_t level[kMaxPlanes][kNumLoopFilterTypes][kNum4x4In64x64];
- uint64_t left[kMaxPlanes][kNumLoopFilterTransformSizeIds]
- [kNumLoopFilterMasks];
- uint64_t top[kMaxPlanes][kNumLoopFilterTransformSizeIds]
- [kNumLoopFilterMasks];
- };
-
- LoopFilterMask() = default;
-
- // Loop filter mask is built and used for each superblock individually.
- // Thus not copyable/movable.
- LoopFilterMask(const LoopFilterMask&) = delete;
- LoopFilterMask& operator=(const LoopFilterMask&) = delete;
- LoopFilterMask(LoopFilterMask&&) = delete;
- LoopFilterMask& operator=(LoopFilterMask&&) = delete;
-
- // Allocates the loop filter masks for the given |width| and
- // |height| if necessary and zeros out the appropriate number of
- // entries. Returns true on success.
- bool Reset(int width, int height);
-
- // Builds bit masks for tile boundaries.
- // This function is called after the frame has been decoded so that
- // information across tiles is available.
- // Before this function call, bit masks of transform edges other than those
- // on tile boundaries are built together with tile decoding, in
- // Tile::BuildBitMask().
- void Build(const ObuSequenceHeader& sequence_header,
- const ObuFrameHeader& frame_header, int tile_group_start,
- int tile_group_end,
- const BlockParametersHolder& block_parameters_holder,
- const Array2D<TransformSize>& inter_transform_sizes);
-
- uint8_t GetLevel(int mask_id, int plane, LoopFilterType type,
- int offset) const {
- return loop_filter_masks_[mask_id].level[plane][type][offset];
- }
-
- uint64_t GetLeft(int mask_id, int plane, LoopFilterTransformSizeId tx_size_id,
- int index) const {
- return loop_filter_masks_[mask_id].left[plane][tx_size_id][index];
- }
-
- uint64_t GetTop(int mask_id, int plane, LoopFilterTransformSizeId tx_size_id,
- int index) const {
- return loop_filter_masks_[mask_id].top[plane][tx_size_id][index];
- }
-
- int num_64x64_blocks_per_row() const { return num_64x64_blocks_per_row_; }
-
- void SetLeft(uint64_t new_mask, int mask_id, int plane,
- LoopFilterTransformSizeId transform_size_id, int index) {
- loop_filter_masks_[mask_id].left[plane][transform_size_id][index] |=
- new_mask;
- }
-
- void SetTop(uint64_t new_mask, int mask_id, int plane,
- LoopFilterTransformSizeId transform_size_id, int index) {
- loop_filter_masks_[mask_id].top[plane][transform_size_id][index] |=
- new_mask;
- }
-
- void SetLevel(uint8_t level, int mask_id, int plane, LoopFilterType type,
- int offset) {
- loop_filter_masks_[mask_id].level[plane][type][offset] = level;
- }
-
- static int GetIndex(int row4x4) { return row4x4 >> 2; }
-
- static int GetShift(int row4x4, int column4x4) {
- return ((row4x4 & 3) << 4) | column4x4;
- }
-
- static int GetLevelOffset(int row4x4, int column4x4) {
- assert(row4x4 < 16);
- assert(column4x4 < 16);
- return (row4x4 << 4) | column4x4;
- }
-
- static constexpr int GetModeId(PredictionMode mode) {
- return static_cast<int>(kPredictionModeDeltasMask.Contains(mode));
- }
-
- // 7.14.5.
- static void ComputeDeblockFilterLevels(
- const ObuFrameHeader& frame_header, int segment_id, int level_index,
- const int8_t delta_lf[kFrameLfCount],
- uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
- const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
- uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta,
- 0, kMaxLoopFilterValue);
- const auto feature = static_cast<SegmentFeature>(
- kSegmentFeatureLoopFilterYVertical + level_index);
- level = Clip3(
- level + frame_header.segmentation.feature_data[segment_id][feature], 0,
- kMaxLoopFilterValue);
- if (!frame_header.loop_filter.delta_enabled) {
- static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
- memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
- return;
- }
- assert(frame_header.loop_filter.delta_enabled);
- const int shift = level >> 5;
- deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
- level +
- LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
- shift),
- 0, kMaxLoopFilterValue);
- // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does
- // not have to be populated.
- for (int reference_frame = kReferenceFrameIntra + 1;
- reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
- for (int mode_id = 0; mode_id < 2; ++mode_id) {
- deblock_filter_levels[reference_frame][mode_id] = Clip3(
- level +
- LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
- frame_header.loop_filter.mode_deltas[mode_id],
- shift),
- 0, kMaxLoopFilterValue);
- }
- }
- }
-
- private:
- std::unique_ptr<Data[]> loop_filter_masks_;
- int num_64x64_blocks_ = -1;
- int num_64x64_blocks_per_row_;
- int num_64x64_blocks_per_column_;
-
- // Mask used to determine the index for mode_deltas lookup.
- static constexpr BitMaskSet kPredictionModeDeltasMask{
- BitMaskSet(kPredictionModeNearestMv, kPredictionModeNearMv,
- kPredictionModeNewMv, kPredictionModeNearestNearestMv,
- kPredictionModeNearNearMv, kPredictionModeNearestNewMv,
- kPredictionModeNewNearestMv, kPredictionModeNearNewMv,
- kPredictionModeNewNearMv, kPredictionModeNewNewMv)};
-
- // Validates that the loop filter masks at different transform sizes are
- // mutually exclusive. Only used in an assert. This function will not be used
- // in optimized builds.
- bool IsValid() const;
-};
-
-} // namespace libgav1
-
-#endif // LIBGAV1_SRC_LOOP_FILTER_MASK_H_
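
Note on the removal above: the deleted LoopFilterMask packed the deblock edge bits of one 64x64 unit (16x16 4x4 positions) into four uint64_t words per plane and transform-size id. A minimal, self-contained sketch of that indexing, reconstructed from the deleted GetIndex()/GetShift() helpers; the surrounding array layout is illustrative only, not the full removed structure:

#include <cassert>
#include <cstdint>

// Which of the four 64-bit words holds the bit for |row4x4| (0..15).
int GetIndex(int row4x4) { return row4x4 >> 2; }

// Bit position of (|row4x4|, |column4x4|) inside that word: 4 rows x 16 columns.
int GetShift(int row4x4, int column4x4) { return ((row4x4 & 3) << 4) | column4x4; }

int main() {
  uint64_t left[4] = {};  // one (plane, transform size) slice of the old mask
  const int row4x4 = 5;
  const int column4x4 = 9;
  left[GetIndex(row4x4)] |= uint64_t{1} << GetShift(row4x4, column4x4);
  assert(left[1] == uint64_t{1} << 25);  // row 5 -> word 1, bit ((5 & 3) << 4) | 9
  return 0;
}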
diff --git a/chromium/third_party/libgav1/src/src/motion_vector.cc b/chromium/third_party/libgav1/src/src/motion_vector.cc
index c7a496e5979..8223f3decc1 100644
--- a/chromium/third_party/libgav1/src/src/motion_vector.cc
+++ b/chromium/third_party/libgav1/src/src/motion_vector.cc
@@ -479,19 +479,28 @@ void TemporalScan(const Tile::Block& block, bool is_compound,
if (count != 0) {
BlockParameters* const bp = block.bp;
int reference_offsets[2];
- const int offset_0 = GetRelativeDistance(
- tile.frame_header().order_hint,
- tile.current_frame().order_hint(bp->reference_frame[0]),
- tile.sequence_header().order_hint_shift_bits);
+ const int offset_0 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[0]];
reference_offsets[0] =
Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance);
if (is_compound) {
- const int offset_1 = GetRelativeDistance(
- tile.frame_header().order_hint,
- tile.current_frame().order_hint(bp->reference_frame[1]),
- tile.sequence_header().order_hint_shift_bits);
+ const int offset_1 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[1]];
reference_offsets[1] =
Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance);
+ // Pad so that SIMD implementations won't read uninitialized memory.
+ if ((count & 1) != 0) {
+ temporal_mvs[count].mv32 = 0;
+ temporal_reference_offsets[count] = 0;
+ }
+ } else {
+ // Pad so that SIMD implementations won't read uninitialized memory.
+ for (int i = count; i < ((count + 3) & ~3); ++i) {
+ temporal_mvs[i].mv32 = 0;
+ temporal_reference_offsets[i] = 0;
+ }
}
AddTemporalReferenceMvCandidate(
tile.frame_header(), reference_offsets, temporal_mvs,
@@ -752,12 +761,12 @@ void AddSample(const Tile::Block& block, int delta_row, int delta_column,
// or -1 so that it can be XORed and subtracted directly in ApplySign() and
// corresponding SIMD implementations.
bool MotionFieldProjection(
- const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+ const ObuFrameHeader& frame_header,
const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
reference_frames,
- ReferenceFrameType source, unsigned int order_hint_shift_bits,
- int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end,
- int x8_start, int x8_end, TemporalMotionField* const motion_field) {
+ ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign,
+ int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* const motion_field) {
const int source_index =
frame_header.reference_frame_index[source - kReferenceFrameLast];
auto* const source_frame = reference_frames[source_index].get();
@@ -770,12 +779,10 @@ bool MotionFieldProjection(
}
assert(reference_to_current_with_sign >= -kMaxFrameDistance);
if (reference_to_current_with_sign > kMaxFrameDistance) return true;
+ const ReferenceInfo& reference_info = *source_frame->reference_info();
const dsp::Dsp& dsp = *dsp::GetDspTable(8);
dsp.motion_field_projection_kernel(
- source_frame->motion_field_reference_frame(y8_start, 0),
- source_frame->motion_field_mv(y8_start, 0),
- source_frame->order_hint_array(), current_frame.order_hint(source),
- order_hint_shift_bits, reference_to_current_with_sign, dst_sign, y8_start,
+ reference_info, reference_to_current_with_sign, dst_sign, y8_start,
y8_end, x8_start, x8_end, motion_field);
return true;
}
@@ -921,62 +928,58 @@ void SetupMotionField(
const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
reference_frames,
- unsigned int order_hint_shift_bits, int row4x4_start, int row4x4_end,
- int column4x4_start, int column4x4_end,
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
TemporalMotionField* const motion_field) {
assert(frame_header.use_ref_frame_mvs);
- assert(order_hint_shift_bits != 0);
const int y8_start = DivideBy2(row4x4_start);
const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4));
const int x8_start = DivideBy2(column4x4_start);
const int x8_end =
DivideBy2(std::min(column4x4_end, frame_header.columns4x4));
- const int8_t* const reference_frame_index =
- frame_header.reference_frame_index;
- const int last_index = reference_frame_index[0];
- const int last_alternate_order_hint =
- reference_frames[last_index]->order_hint(kReferenceFrameAlternate);
- const int current_gold_order_hint =
- current_frame.order_hint(kReferenceFrameGolden);
- if (last_alternate_order_hint != current_gold_order_hint) {
- const int reference_offset_last =
- -GetRelativeDistance(current_frame.order_hint(kReferenceFrameLast),
- frame_header.order_hint, order_hint_shift_bits);
- if (std::abs(reference_offset_last) <= kMaxFrameDistance) {
- MotionFieldProjection(frame_header, current_frame, reference_frames,
- kReferenceFrameLast, order_hint_shift_bits,
- reference_offset_last, -1, y8_start, y8_end,
- x8_start, x8_end, motion_field);
+ const int last_index = frame_header.reference_frame_index[0];
+ const ReferenceInfo& reference_info = *current_frame.reference_info();
+ if (!IsIntraFrame(reference_frames[last_index]->frame_type())) {
+ const int last_alternate_order_hint =
+ reference_frames[last_index]
+ ->reference_info()
+ ->order_hint[kReferenceFrameAlternate];
+ const int current_gold_order_hint =
+ reference_info.order_hint[kReferenceFrameGolden];
+ if (last_alternate_order_hint != current_gold_order_hint) {
+ const int reference_offset_last =
+ -reference_info.relative_distance_from[kReferenceFrameLast];
+ if (std::abs(reference_offset_last) <= kMaxFrameDistance) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast, reference_offset_last, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
+ }
}
}
int ref_stamp = 1;
const int reference_offset_backward =
- GetRelativeDistance(current_frame.order_hint(kReferenceFrameBackward),
- frame_header.order_hint, order_hint_shift_bits);
+ reference_info.relative_distance_from[kReferenceFrameBackward];
if (reference_offset_backward > 0 &&
- MotionFieldProjection(frame_header, current_frame, reference_frames,
- kReferenceFrameBackward, order_hint_shift_bits,
- reference_offset_backward, 0, y8_start, y8_end,
- x8_start, x8_end, motion_field)) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameBackward, reference_offset_backward,
+ 0, y8_start, y8_end, x8_start, x8_end,
+ motion_field)) {
--ref_stamp;
}
const int reference_offset_alternate2 =
- GetRelativeDistance(current_frame.order_hint(kReferenceFrameAlternate2),
- frame_header.order_hint, order_hint_shift_bits);
+ reference_info.relative_distance_from[kReferenceFrameAlternate2];
if (reference_offset_alternate2 > 0 &&
- MotionFieldProjection(frame_header, current_frame, reference_frames,
- kReferenceFrameAlternate2, order_hint_shift_bits,
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate2,
reference_offset_alternate2, 0, y8_start, y8_end,
x8_start, x8_end, motion_field)) {
--ref_stamp;
}
if (ref_stamp >= 0) {
const int reference_offset_alternate =
- GetRelativeDistance(current_frame.order_hint(kReferenceFrameAlternate),
- frame_header.order_hint, order_hint_shift_bits);
+ reference_info.relative_distance_from[kReferenceFrameAlternate];
if (reference_offset_alternate > 0 &&
- MotionFieldProjection(frame_header, current_frame, reference_frames,
- kReferenceFrameAlternate, order_hint_shift_bits,
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate,
reference_offset_alternate, 0, y8_start, y8_end,
x8_start, x8_end, motion_field)) {
--ref_stamp;
@@ -984,13 +987,11 @@ void SetupMotionField(
}
if (ref_stamp >= 0) {
const int reference_offset_last2 =
- -GetRelativeDistance(current_frame.order_hint(kReferenceFrameLast2),
- frame_header.order_hint, order_hint_shift_bits);
+ -reference_info.relative_distance_from[kReferenceFrameLast2];
if (std::abs(reference_offset_last2) <= kMaxFrameDistance) {
- MotionFieldProjection(frame_header, current_frame, reference_frames,
- kReferenceFrameLast2, order_hint_shift_bits,
- reference_offset_last2, -1, y8_start, y8_end,
- x8_start, x8_end, motion_field);
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast2, reference_offset_last2, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
}
}
}
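
The hunks above replace per-call GetRelativeDistance() arithmetic with distances cached in ReferenceInfo (relative_distance_from / relative_distance_to). A hedged sketch of the wrapped-difference computation those cached values stand in for, written against the order hint bit count rather than the shift amount used by the removed calls (that parameterization is an assumption made for readability, not the library's signature):

#include <cstdio>

// Signed distance between two order hints, wrapped to |order_hint_bits| bits.
int RelativeDistance(unsigned int a, unsigned int b, int order_hint_bits) {
  if (order_hint_bits == 0) return 0;  // order hints disabled
  const unsigned int mask = (1u << order_hint_bits) - 1;
  const unsigned int diff = (a - b) & mask;        // wrapped difference
  const unsigned int half = 1u << (order_hint_bits - 1);
  return static_cast<int>(diff ^ half) - static_cast<int>(half);  // sign-extend
}

int main() {
  // With 7-bit hints, hint 2 is 4 frames after hint 126 despite the wraparound.
  printf("%d %d\n", RelativeDistance(2, 126, 7), RelativeDistance(126, 2, 7));
  // prints: 4 -4
  return 0;
}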
diff --git a/chromium/third_party/libgav1/src/src/motion_vector.h b/chromium/third_party/libgav1/src/src/motion_vector.h
index f34bebb5346..d739e802831 100644
--- a/chromium/third_party/libgav1/src/src/motion_vector.h
+++ b/chromium/third_party/libgav1/src/src/motion_vector.h
@@ -51,8 +51,8 @@ void SetupMotionField(
const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
reference_frames,
- unsigned int order_hint_shift_bits, int row4x4_start, int row4x4_end,
- int column4x4_start, int column4x4_end, TemporalMotionField* motion_field);
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+ TemporalMotionField* motion_field);
} // namespace libgav1
diff --git a/chromium/third_party/libgav1/src/src/obu_parser.cc b/chromium/third_party/libgav1/src/src/obu_parser.cc
index 0a3ccd49254..ffa267fb348 100644
--- a/chromium/third_party/libgav1/src/src/obu_parser.cc
+++ b/chromium/third_party/libgav1/src/src/obu_parser.cc
@@ -1080,29 +1080,32 @@ void ObuParser::ComputeSegmentLosslessAndQIndex() {
}
bool ObuParser::ParseCdefParameters() {
+ const int coeff_shift = sequence_header_.color_config.bitdepth - 8;
if (frame_header_.coded_lossless || frame_header_.allow_intrabc ||
!sequence_header_.enable_cdef) {
- frame_header_.cdef.damping = 3;
+ frame_header_.cdef.damping = 3 + coeff_shift;
return true;
}
Cdef* const cdef = &frame_header_.cdef;
int64_t scratch;
OBU_READ_LITERAL_OR_FAIL(2);
- cdef->damping = scratch + 3;
+ cdef->damping = scratch + 3 + coeff_shift;
OBU_READ_LITERAL_OR_FAIL(2);
cdef->bits = scratch;
for (int i = 0; i < (1 << cdef->bits); ++i) {
OBU_READ_LITERAL_OR_FAIL(4);
- cdef->y_primary_strength[i] = scratch;
+ cdef->y_primary_strength[i] = scratch << coeff_shift;
OBU_READ_LITERAL_OR_FAIL(2);
cdef->y_secondary_strength[i] = scratch;
if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i];
+ cdef->y_secondary_strength[i] <<= coeff_shift;
if (sequence_header_.color_config.is_monochrome) continue;
OBU_READ_LITERAL_OR_FAIL(4);
- cdef->uv_primary_strength[i] = scratch;
+ cdef->uv_primary_strength[i] = scratch << coeff_shift;
OBU_READ_LITERAL_OR_FAIL(2);
cdef->uv_secondary_strength[i] = scratch;
if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i];
+ cdef->uv_secondary_strength[i] <<= coeff_shift;
}
return true;
}
@@ -1192,6 +1195,12 @@ bool ObuParser::IsSkipModeAllowed() {
const unsigned int reference_hint =
decoder_state_
.reference_order_hint[frame_header_.reference_frame_index[i]];
+ // TODO(linfengz): |relative_distance| equals
+ // current_frame_->reference_info()->
+ // relative_distance_from[i + kReferenceFrameLast];
+ // However, the unit test ObuParserTest.SkipModeParameters() would fail.
+ // Will figure out how to initialize |current_frame_.reference_info_| in the
+ // RefCountedBuffer later.
const int relative_distance =
GetRelativeDistance(reference_hint, frame_header_.order_hint,
sequence_header_.order_hint_shift_bits);
@@ -1842,7 +1851,6 @@ bool ObuParser::ParseFrameParameters() {
if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
decoder_state_.reference_valid.fill(false);
decoder_state_.reference_order_hint.fill(0);
- current_frame_->ClearOrderHints();
}
OBU_READ_BIT_OR_FAIL;
frame_header_.enable_cdf_update = !static_cast<bool>(scratch);
@@ -2092,16 +2100,44 @@ bool ObuParser::ParseFrameParameters() {
return false;
}
if (!IsIntraFrame(frame_header_.frame_type)) {
- for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
- const auto reference_frame =
- static_cast<ReferenceFrameType>(kReferenceFrameLast + i);
+ // Initialize the kReferenceFrameIntra type reference frame information to
+ // simplify the frame type validation in motion field projection.
+ // Set the kReferenceFrameIntra type |order_hint_| to
+ // |frame_header_.order_hint|. This guarantees that in SIMD implementations,
+ // the other reference frame information of the kReferenceFrameIntra type
+ // could be correctly initialized using the following loop with
+ // |frame_header_.order_hint| being the |hint|.
+ ReferenceInfo* const reference_info = current_frame_->reference_info();
+ reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint;
+ reference_info->relative_distance_from[kReferenceFrameIntra] = 0;
+ reference_info->relative_distance_to[kReferenceFrameIntra] = 0;
+ reference_info->skip_references[kReferenceFrameIntra] = true;
+ reference_info->projection_divisions[kReferenceFrameIntra] = 0;
+
+ for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) {
+ const auto reference_frame = static_cast<ReferenceFrameType>(i);
const uint8_t hint =
- decoder_state_
- .reference_order_hint[frame_header_.reference_frame_index[i]];
- current_frame_->set_order_hint(reference_frame, hint);
- decoder_state_.reference_frame_sign_bias[reference_frame] =
+ decoder_state_.reference_order_hint
+ [frame_header_.reference_frame_index[i - kReferenceFrameLast]];
+ reference_info->order_hint[reference_frame] = hint;
+ const int relative_distance_from =
GetRelativeDistance(hint, frame_header_.order_hint,
- sequence_header_.order_hint_shift_bits) > 0;
+ sequence_header_.order_hint_shift_bits);
+ const int relative_distance_to =
+ GetRelativeDistance(frame_header_.order_hint, hint,
+ sequence_header_.order_hint_shift_bits);
+ reference_info->relative_distance_from[reference_frame] =
+ relative_distance_from;
+ reference_info->relative_distance_to[reference_frame] =
+ relative_distance_to;
+ reference_info->skip_references[reference_frame] =
+ relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+ reference_info->projection_divisions[reference_frame] =
+ reference_info->skip_references[reference_frame]
+ ? 0
+ : kProjectionMvDivisionLookup[relative_distance_to];
+ decoder_state_.reference_frame_sign_bias[reference_frame] =
+ relative_distance_from > 0;
}
}
if (frame_header_.enable_cdf_update &&
@@ -2128,6 +2164,11 @@ bool ObuParser::ParseFrameHeader() {
ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters();
if (!status) return false;
ComputeSegmentLosslessAndQIndex();
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // delta_q_present is equal to 0 when CodedLossless is equal to 1.
+ if (frame_header_.coded_lossless && frame_header_.delta_q.present) {
+ return false;
+ }
status = ParseLoopFilterParameters();
if (!status) return false;
current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter);
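
ParseCdefParameters() above now pre-shifts the parsed CDEF damping and strengths by coeff_shift = bitdepth - 8, so high-bitdepth streams no longer need that shift inside the filter itself. A small illustrative sketch of the scaling for a 10-bit stream; the values are examples, not taken from a real bitstream:

#include <cstdio>

int main() {
  const int bitdepth = 10;
  const int coeff_shift = bitdepth - 8;  // 2 for 10-bit, 0 for 8-bit
  const int damping = 3 + coeff_shift;   // default damping when CDEF is disabled
  int y_secondary_strength = 3;          // raw 2-bit value read from the OBU
  if (y_secondary_strength == 3) ++y_secondary_strength;  // 3 encodes 4
  y_secondary_strength <<= coeff_shift;  // scaled once at parse time
  printf("damping=%d y_secondary_strength=%d\n", damping, y_secondary_strength);
  return 0;
}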
diff --git a/chromium/third_party/libgav1/src/src/post_filter.h b/chromium/third_party/libgav1/src/src/post_filter.h
index 16c784ac458..c7af197575d 100644
--- a/chromium/third_party/libgav1/src/src/post_filter.h
+++ b/chromium/third_party/libgav1/src/src/post_filter.h
@@ -27,7 +27,7 @@
#include "src/dsp/common.h"
#include "src/dsp/dsp.h"
-#include "src/loop_filter_mask.h"
+#include "src/frame_scratch_buffer.h"
#include "src/loop_restoration_info.h"
#include "src/obu_parser.h"
#include "src/utils/array_2d.h"
@@ -46,8 +46,6 @@ namespace libgav1 {
// and loop restoration.
// Historically, for example in libaom, loop filter refers to deblock filter.
// To avoid name conflicts, we call this class PostFilter (post processing).
-// Input info includes deblock parameters (bit masks), CDEF
-// parameters, super resolution parameters and loop restoration parameters.
// In-loop post filtering order is:
// deblock --> CDEF --> super resolution--> loop restoration.
// When CDEF and super resolution is not used, we can combine deblock
@@ -76,14 +74,9 @@ class PostFilter {
// * Output: |loop_restoration_buffer_|.
// -> Now |frame_buffer_| contains the filtered frame.
PostFilter(const ObuFrameHeader& frame_header,
- const ObuSequenceHeader& sequence_header, LoopFilterMask* masks,
- const Array2D<int16_t>& cdef_index,
- const Array2D<TransformSize>& inter_transform_sizes,
- LoopRestorationInfo* restoration_info,
- BlockParametersHolder* block_parameters, YuvBuffer* frame_buffer,
- YuvBuffer* deblock_buffer, const dsp::Dsp* dsp,
- ThreadPool* thread_pool, uint8_t* threaded_window_buffer,
- uint8_t* superres_line_buffer, int do_post_filter_mask);
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer,
+ const dsp::Dsp* dsp, int do_post_filter_mask);
// non copyable/movable.
PostFilter(const PostFilter&) = delete;
@@ -123,9 +116,9 @@ class PostFilter {
// with a shift to the top-left).
void ApplyFilteringThreaded();
- // Does the overall post processing filter for one superblock row (starting at
- // |row4x4| with height 4*|sb4x4|. Cdef, SuperRes and Loop Restoration lag by
- // one superblock row to account for deblocking.
+ // Does the overall post processing filter for one superblock row starting at
+ // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter
+ // will not be applied.
//
// Filter behavior (single-threaded):
// * Deblock: In-place filtering. The output is written to |source_buffer_|.
@@ -143,26 +136,35 @@ class PostFilter {
// top-left).
// Returns the index of the last row whose post processing is complete and can
// be used for referencing.
- int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
- bool is_last_row);
+ int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row,
+ bool do_deblock);
+
+ // Apply deblocking filter in one direction (specified by |loop_filter_type|)
+ // for the superblock row starting at |row4x4_start| for columns starting from
+ // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling)
+ // until the smallest multiple of 16 that is >= |column4x4_end| or until
+ // |frame_header_.columns4x4|, whichever is lower. This function must be
+ // called only if |DoDeblock()| returns true.
+ void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start,
+ int column4x4_start, int column4x4_end, int sb4x4);
- bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); }
static bool DoCdef(const ObuFrameHeader& frame_header,
int do_post_filter_mask) {
- return (do_post_filter_mask & 0x02) != 0 &&
- (frame_header.cdef.bits > 0 ||
+ return (frame_header.cdef.bits > 0 ||
frame_header.cdef.y_primary_strength[0] > 0 ||
frame_header.cdef.y_secondary_strength[0] > 0 ||
frame_header.cdef.uv_primary_strength[0] > 0 ||
- frame_header.cdef.uv_secondary_strength[0] > 0);
+ frame_header.cdef.uv_secondary_strength[0] > 0) &&
+ (do_post_filter_mask & 0x02) != 0;
}
+ bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); }
// If filter levels for Y plane (0 for vertical, 1 for horizontal),
// are all zero, deblock filter will not be applied.
static bool DoDeblock(const ObuFrameHeader& frame_header,
uint8_t do_post_filter_mask) {
- return (do_post_filter_mask & 0x01) != 0 &&
- (frame_header.loop_filter.level[0] > 0 ||
- frame_header.loop_filter.level[1] > 0);
+ return (frame_header.loop_filter.level[0] > 0 ||
+ frame_header.loop_filter.level[1] > 0) &&
+ (do_post_filter_mask & 0x01) != 0;
}
bool DoDeblock() const {
return DoDeblock(frame_header_, do_post_filter_mask_);
@@ -178,20 +180,21 @@ class PostFilter {
const int8_t delta_lf[kFrameLfCount],
uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
[kNumReferenceFrameTypes][2]) const;
- bool DoRestoration() const {
- return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_);
- }
// Returns true if loop restoration will be performed for the given parameters
// and mask.
static bool DoRestoration(const LoopRestoration& loop_restoration,
uint8_t do_post_filter_mask, int num_planes) {
- if ((do_post_filter_mask & 0x08) == 0) return false;
if (num_planes == kMaxPlanesMonochrome) {
- return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone;
+ return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone &&
+ (do_post_filter_mask & 0x08) != 0;
}
- return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
- loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
- loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone;
+ return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) &&
+ (do_post_filter_mask & 0x08) != 0;
+ }
+ bool DoRestoration() const {
+ return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_);
}
// Returns a pointer to the unfiltered buffer. This is used by the Tile class
@@ -204,13 +207,12 @@ class PostFilter {
// mask.
static bool DoSuperRes(const ObuFrameHeader& frame_header,
uint8_t do_post_filter_mask) {
- return (do_post_filter_mask & 0x04) != 0 &&
- frame_header.width != frame_header.upscaled_width;
+ return frame_header.width != frame_header.upscaled_width &&
+ (do_post_filter_mask & 0x04) != 0;
}
bool DoSuperRes() const {
return DoSuperRes(frame_header_, do_post_filter_mask_);
}
- LoopFilterMask* masks() const { return masks_; }
LoopRestorationInfo* restoration_info() const { return restoration_info_; }
uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
int row4x4, int column4x4) const {
@@ -249,37 +251,23 @@ class PostFilter {
// The type of the HorizontalDeblockFilter and VerticalDeblockFilter member
// functions.
using DeblockFilter = void (PostFilter::*)(Plane plane, int row4x4_start,
- int column4x4_start, int unit_id);
- // The lookup table for picking the deblock filter, according to:
- // kDeblockFilterBitMask (first dimension), and deblock filter type (second).
- const DeblockFilter deblock_filter_type_table_[2][2] = {
- {&PostFilter::VerticalDeblockFilterNoMask,
- &PostFilter::HorizontalDeblockFilterNoMask},
- {&PostFilter::VerticalDeblockFilter,
- &PostFilter::HorizontalDeblockFilter},
- };
- // Buffers for loop restoration intermediate results. Depending on the filter
- // type, only one member of the union is used.
- union IntermediateBuffers {
- // For Wiener filter.
- // The array |intermediate| in Section 7.17.4, the intermediate results
- // between the horizontal and vertical filters.
- alignas(kMaxAlignment)
- uint16_t wiener[(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1) *
- kMaxSuperBlockSizeInPixels];
- // For self-guided filter.
- struct {
- // The arrays flt0 and flt1 in Section 7.17.2, the outputs of the box
- // filter process in pass 0 and pass 1.
- alignas(
- kMaxAlignment) int32_t output[2][kMaxBoxFilterProcessOutputPixels];
- // The 2d arrays A and B in Section 7.17.3, the intermediate results in
- // the box filter process. Reused for pass 0 and pass 1.
- alignas(kMaxAlignment) uint32_t
- intermediate_a[kBoxFilterProcessIntermediatePixels];
- alignas(kMaxAlignment) uint32_t
- intermediate_b[kBoxFilterProcessIntermediatePixels];
- } box_filter;
+ int column4x4_start);
+ // The lookup table for picking the deblock filter, according to deblock
+ // filter type.
+ const DeblockFilter deblock_filter_func_[2] = {
+ &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter};
+
+ // The type of GetVerticalDeblockFilterEdgeInfo* member functions.
+ using DeblockVerticalEdgeInfo = bool (PostFilter::*)(
+ const Plane plane, int row4x4, int column4x4, const int8_t subsampling_x,
+ const int8_t subsampling_y, BlockParameters* const* bp_ptr,
+ uint8_t* level, int* step, int* filter_length) const;
+ // The lookup table for picking the GetVerticalDeblockEdgeInfo based on the
+ // plane.
+ const DeblockVerticalEdgeInfo deblock_vertical_edge_info_[kMaxPlanes] = {
+ &PostFilter::GetVerticalDeblockFilterEdgeInfo,
+ &PostFilter::GetVerticalDeblockFilterEdgeInfoUV,
+ &PostFilter::GetVerticalDeblockFilterEdgeInfoUV,
};
// Functions common to all post filters.
@@ -337,35 +325,26 @@ class PostFilter {
int GetDeblockUnitId(int row_unit, int column_unit) const {
return row_unit * num_64x64_blocks_per_row_ + column_unit;
}
- static dsp::LoopFilterSize GetLoopFilterSize(Plane plane, int step) {
- if (step == 4) {
- return dsp::kLoopFilterSize4;
- }
- if (step == 8) {
- return (plane == kPlaneY) ? dsp::kLoopFilterSize8 : dsp::kLoopFilterSize6;
- }
- return (plane == kPlaneY) ? dsp::kLoopFilterSize14 : dsp::kLoopFilterSize6;
- }
- void InitDeblockFilterParams(); // Part of 7.14.4.
- void GetDeblockFilterParams(uint8_t level, int* outer_thresh,
- int* inner_thresh, int* hev_thresh) const;
- template <LoopFilterType type>
- bool GetDeblockFilterEdgeInfo(Plane plane, int row4x4, int column4x4,
- int8_t subsampling_x, int8_t subsampling_y,
- uint8_t* level, int* step,
- int* filter_length) const;
+ bool GetHorizontalDeblockFilterEdgeInfo(Plane plane, int row4x4,
+ int column4x4, int8_t subsampling_x,
+ int8_t subsampling_y, uint8_t* level,
+ int* step, int* filter_length) const;
+ bool GetVerticalDeblockFilterEdgeInfo(Plane plane, int row4x4, int column4x4,
+ int8_t subsampling_x,
+ int8_t subsampling_y,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level, int* step,
+ int* filter_length) const;
+ bool GetVerticalDeblockFilterEdgeInfoUV(Plane plane, int row4x4,
+ int column4x4, int8_t subsampling_x,
+ int8_t subsampling_y,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level, int* step,
+ int* filter_length) const;
void HorizontalDeblockFilter(Plane plane, int row4x4_start,
- int column4x4_start, int unit_id);
- void VerticalDeblockFilter(Plane plane, int row4x4_start, int column4x4_start,
- int unit_id);
- // |unit_id| is not used, keep it to match the same interface as
- // HorizontalDeblockFilter().
- void HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start,
- int column4x4_start, int unit_id);
- // |unit_id| is not used, keep it to match the same interface as
- // VerticalDeblockFilter().
- void VerticalDeblockFilterNoMask(Plane plane, int row4x4_start,
- int column4x4_start, int unit_id);
+ int column4x4_start);
+ void VerticalDeblockFilter(Plane plane, int row4x4_start,
+ int column4x4_start);
// HorizontalDeblockFilter and VerticalDeblockFilter must have the correct
// signature.
static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter),
@@ -385,7 +364,6 @@ class PostFilter {
// Functions for the cdef filter.
uint8_t* GetCdefBufferAndStride(int start_x, int start_y, int plane,
- int subsampling_x, int subsampling_y,
int window_buffer_plane_size,
int* cdef_stride) const;
// This function prepares the input source block for cdef filtering. The input
@@ -394,9 +372,9 @@ class PostFilter {
// pixels with a large value. This achieves the required behavior defined in
// section 5.11.52 of the spec.
template <typename Pixel>
- void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row_64x64,
- int column_64x64, uint16_t* cdef_source,
- ptrdiff_t cdef_stride);
+ void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
+ int column4x4, uint16_t* cdef_source,
+ ptrdiff_t cdef_stride, bool y_plane);
template <typename Pixel>
void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
int block_height4x4, int row4x4_start,
@@ -434,12 +412,14 @@ class PostFilter {
// Functions for the Loop Restoration filter.
template <typename Pixel>
- void ApplyLoopRestorationForOneUnit(
- uint8_t* cdef_buffer, ptrdiff_t cdef_buffer_stride, Plane plane,
- int plane_height, int x, int y, int row, int column, int unit_row,
- int current_process_unit_height, int plane_process_unit_width,
- int plane_unit_size, int num_horizontal_units, int plane_width,
- Array2DView<Pixel>* loop_restored_window);
+ void ApplyLoopRestorationForOneUnit(uint8_t* cdef_buffer,
+ ptrdiff_t cdef_buffer_stride, Plane plane,
+ int plane_height, int x, int y, int row,
+ int column, int unit_row,
+ int current_process_unit_height,
+ int plane_unit_size,
+ int num_horizontal_units, int plane_width,
+ Array2DView<Pixel>* loop_restored_window);
template <typename Pixel>
void ApplyLoopRestorationForSuperBlock(Plane plane, int x, int y,
int unit_row,
@@ -454,8 +434,8 @@ class PostFilter {
void ApplyLoopRestorationForOneRowInWindow(
uint8_t* cdef_buffer, ptrdiff_t cdef_buffer_stride, Plane plane,
int plane_height, int plane_width, int x, int y, int row, int unit_row,
- int current_process_unit_height, int process_unit_width, int window_width,
- int plane_unit_size, int num_horizontal_units);
+ int current_process_unit_height, int plane_unit_size, int window_width,
+ int num_horizontal_units);
// Note for ApplyLoopRestoration():
// First, we must differentiate loop restoration processing unit from loop
// restoration unit.
@@ -501,12 +481,8 @@ class PostFilter {
const int8_t subsampling_y_[kMaxPlanes];
const int8_t planes_;
const int pixel_size_;
- // This class does not take ownership of the masks/restoration_info, but it
- // could change their values.
- LoopFilterMask* const masks_;
- uint8_t inner_thresh_[kMaxLoopFilterValue + 1] = {};
- uint8_t outer_thresh_[kMaxLoopFilterValue + 1] = {};
- uint8_t hev_thresh_[kMaxLoopFilterValue + 1] = {};
+ const uint8_t* const inner_thresh_;
+ const uint8_t* const outer_thresh_;
// This stores the deblocking filter levels assuming that the delta is zero.
// This will be used by all superblocks whose delta is zero (without having to
// recompute them). The dimensions (in order) are: segment_id, level_index
@@ -529,8 +505,6 @@ class PostFilter {
// nullptr as well.
uint8_t* const threaded_window_buffer_;
LoopRestorationInfo* const restoration_info_;
- const int window_buffer_width_;
- const int window_buffer_height_;
// Pointer to the line buffer used by ApplySuperRes(). If SuperRes is on, then
// the buffer will be large enough to hold one downscaled row +
// kSuperResHorizontalBorder.
@@ -560,8 +534,10 @@ class PostFilter {
// This buffer is used only when both Cdef and Loop Restoration are on.
YuvBuffer& deblock_buffer_;
const uint8_t do_post_filter_mask_;
-
ThreadPool* const thread_pool_;
+ const int window_buffer_width_;
+ const int window_buffer_height_;
+
// Tracks the progress of the post filters.
int progress_row_ = -1;
@@ -571,13 +547,11 @@ class PostFilter {
// Wiener filter needs extended border of three pixels.
// Therefore the size of the buffer is 70x70 pixels.
alignas(alignof(uint16_t)) uint8_t
- block_buffer_[kRestorationProcessingUnitSizeWithBorders *
- kRestorationProcessingUnitSizeWithBorders *
- sizeof(uint16_t)];
+ block_buffer_[kRestorationUnitHeightWithBorders *
+ kRestorationUnitWidthWithBorders * sizeof(uint16_t)];
// A block buffer to hold the input that is converted to uint16_t before
// cdef filtering. Only used in single threaded case.
- uint16_t cdef_block_[kRestorationProcessingUnitSizeWithBorders *
- kRestorationProcessingUnitSizeWithBorders * 3];
+ uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3];
template <int bitdepth, typename Pixel>
friend class PostFilterSuperResTest;
@@ -586,75 +560,69 @@ class PostFilter {
friend class PostFilterHelperFuncTest;
};
+template <typename Pixel>
+void CopyTwoRows(const Pixel* src, const ptrdiff_t src_stride, Pixel** dst,
+ const ptrdiff_t dst_stride, const int width) {
+ for (int i = 0; i < kRestorationBorder - 1; ++i) {
+ memcpy(*dst, src, sizeof(Pixel) * width);
+ src += src_stride;
+ *dst += dst_stride;
+ }
+}
+
// This function takes the cdef filtered buffer and the deblocked buffer to
// prepare a block as input for loop restoration.
// In striped loop restoration:
-// The filtering needs to fetch the area of size (width + 6) x (height + 6),
-// in which (width + 6) x height area is from cdef filtered frame
-// (cdef_buffer). Top 3 rows and bottom 3 rows are from deblocked frame
-// (deblock_buffer).
+// The filtering needs to fetch the area of size (width + 6) x (height + 4),
+// in which (width + 6) x height area is from cdef filtered frame (cdef_buffer).
+// Top 2 rows and bottom 2 rows are from deblocked frame (deblock_buffer).
// Special cases are:
-// (1). when it is the top border, the top 3 rows are from cdef
-// filtered frame.
-// (2). when it is the bottom border, the bottom 3 rows are from cdef
-// filtered frame.
-// For the top 3 rows and bottom 3 rows, the top_row[0] is a copy of the
-// top_row[1]. The bottom_row[2] is a copy of the bottom_row[1]. If cdef is
-// not applied for this frame, cdef_buffer is the same as deblock_buffer.
+// (1). when it is the top border, the top 2 rows are from cdef filtered frame.
+// (2). when it is the bottom border, the bottom 2 rows are from cdef filtered
+// frame.
+// This function is called only when cdef is applied for this frame.
template <typename Pixel>
-void PrepareLoopRestorationBlock(const bool do_cdef, const uint8_t* cdef_buffer,
+void PrepareLoopRestorationBlock(const uint8_t* cdef_buffer,
ptrdiff_t cdef_stride,
const uint8_t* deblock_buffer,
ptrdiff_t deblock_stride, uint8_t* dest,
ptrdiff_t dest_stride, const int width,
const int height, const bool frame_top_border,
const bool frame_bottom_border) {
- const auto* cdef_ptr = reinterpret_cast<const Pixel*>(cdef_buffer);
cdef_stride /= sizeof(Pixel);
- const auto* deblock_ptr = reinterpret_cast<const Pixel*>(deblock_buffer);
deblock_stride /= sizeof(Pixel);
- auto* dst = reinterpret_cast<Pixel*>(dest);
dest_stride /= sizeof(Pixel);
- // Top 3 rows.
- cdef_ptr -= (kRestorationBorder - 1) * cdef_stride + kRestorationBorder;
- if (deblock_ptr != nullptr) deblock_ptr -= kRestorationBorder;
- for (int i = 0; i < kRestorationBorder; ++i) {
- if (frame_top_border || !do_cdef) {
- memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder));
- } else {
- memcpy(dst, deblock_ptr,
- sizeof(Pixel) * (width + 2 * kRestorationBorder));
- }
- if (i > 0) {
- if (deblock_ptr != nullptr) deblock_ptr += deblock_stride;
- cdef_ptr += cdef_stride;
- }
- dst += dest_stride;
+ const auto* cdef_ptr = reinterpret_cast<const Pixel*>(cdef_buffer) -
+ (kRestorationBorder - 1) * cdef_stride -
+ kRestorationBorder;
+ const auto* deblock_ptr =
+ reinterpret_cast<const Pixel*>(deblock_buffer) - kRestorationBorder;
+ auto* dst = reinterpret_cast<Pixel*>(dest) + dest_stride;
+ int h = height;
+ // Top 2 rows.
+ if (frame_top_border) {
+ h += kRestorationBorder - 1;
+ } else {
+ CopyTwoRows<Pixel>(deblock_ptr, deblock_stride, &dst, dest_stride,
+ width + 2 * kRestorationBorder);
+ cdef_ptr += (kRestorationBorder - 1) * cdef_stride;
+ // If |frame_top_border| is true, then we are in the first superblock row,
+ // so in that case, do not increment |deblock_ptr| since we don't store
+ // anything from the first superblock row into |deblock_buffer|.
+ deblock_ptr += 4 * deblock_stride;
}
+ if (frame_bottom_border) h += kRestorationBorder - 1;
// Main body.
- for (int i = 0; i < height; ++i) {
+ do {
memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder));
cdef_ptr += cdef_stride;
dst += dest_stride;
- }
- // Bottom 3 rows. If |frame_top_border| is true, then we are in the first
- // superblock row, so in that case, do not increment |deblock_ptr| since we
- // don't store anything from the first superblock row into |deblock_buffer|.
- if (deblock_ptr != nullptr && !frame_top_border) {
- deblock_ptr += deblock_stride * 4;
- }
- for (int i = 0; i < kRestorationBorder; ++i) {
- if (frame_bottom_border || !do_cdef) {
- memcpy(dst, cdef_ptr, sizeof(Pixel) * (width + 2 * kRestorationBorder));
- } else {
- memcpy(dst, deblock_ptr,
- sizeof(Pixel) * (width + 2 * kRestorationBorder));
- }
- if (i < kRestorationBorder - 2) {
- if (deblock_ptr != nullptr) deblock_ptr += deblock_stride;
- cdef_ptr += cdef_stride;
- }
- dst += dest_stride;
+ } while (--h != 0);
+ // Bottom 2 rows.
+ if (!frame_bottom_border) {
+ deblock_ptr += (kRestorationBorder - 1) * deblock_stride;
+ CopyTwoRows<Pixel>(deblock_ptr, deblock_stride, &dst, dest_stride,
+ width + 2 * kRestorationBorder);
}
}
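
The reordered Do*() predicates above keep the same |do_post_filter_mask| bit assignments (0x01 deblock, 0x02 CDEF, 0x04 SuperRes, 0x08 loop restoration) and only move the mask test after the cheaper frame-header checks. A minimal sketch of that bit layout; the enum names are invented for illustration and do not exist in libgav1:

#include <cstdio>

// Hypothetical names for the bits tested by DoDeblock()/DoCdef()/DoSuperRes()/
// DoRestoration() in the header above.
enum : unsigned char {
  kDoDeblockBit = 0x01,
  kDoCdefBit = 0x02,
  kDoSuperResBit = 0x04,
  kDoRestorationBit = 0x08,
};

int main() {
  const unsigned char do_post_filter_mask = kDoDeblockBit | kDoCdefBit;
  printf("deblock=%d cdef=%d superres=%d restoration=%d\n",
         (do_post_filter_mask & kDoDeblockBit) != 0,
         (do_post_filter_mask & kDoCdefBit) != 0,
         (do_post_filter_mask & kDoSuperResBit) != 0,
         (do_post_filter_mask & kDoRestorationBit) != 0);
  return 0;
}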
diff --git a/chromium/third_party/libgav1/src/src/post_filter/cdef.cc b/chromium/third_party/libgav1/src/src/post_filter/cdef.cc
index c169acd6532..2b3b7119f0b 100644
--- a/chromium/third_party/libgav1/src/src/post_filter/cdef.cc
+++ b/chromium/third_party/libgav1/src/src/post_filter/cdef.cc
@@ -20,6 +20,7 @@ namespace libgav1 {
namespace {
constexpr int kStep64x64 = 16; // =64/4.
+constexpr int kCdefSkip = 8;
constexpr uint8_t kCdefUvDirection[2][2][8] = {
{{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
@@ -57,19 +58,31 @@ void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
}
}
+// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
+// |dst|.
+void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width, int height, size_t pixel_size) {
+ int y = height;
+ do {
+ memcpy(dst, src, width * pixel_size);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--y != 0);
+}
+
} // namespace
uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x,
const int start_y, const int plane,
- const int subsampling_x,
- const int subsampling_y,
const int window_buffer_plane_size,
int* cdef_stride) const {
if (thread_pool_ != nullptr) {
// write output to threaded_window_buffer.
*cdef_stride = window_buffer_width_ * pixel_size_;
- const int column_window = start_x % (window_buffer_width_ >> subsampling_x);
- const int row_window = start_y % (window_buffer_height_ >> subsampling_y);
+ const int column_window =
+ start_x % (window_buffer_width_ >> subsampling_x_[plane]);
+ const int row_window =
+ start_y % (window_buffer_height_ >> subsampling_y_[plane]);
return threaded_window_buffer_ + plane * window_buffer_plane_size +
row_window * (*cdef_stride) + column_window * pixel_size_;
}
@@ -80,72 +93,82 @@ uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x,
template <typename Pixel>
void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
- int row_64x64, int column_64x64,
- uint16_t* cdef_source,
- ptrdiff_t cdef_stride) {
- for (int plane = kPlaneY; plane < planes_; ++plane) {
- uint16_t* cdef_src =
- cdef_source + plane * kRestorationProcessingUnitSizeWithBorders *
- kRestorationProcessingUnitSizeWithBorders;
- const int8_t subsampling_x = subsampling_x_[plane];
- const int8_t subsampling_y = subsampling_y_[plane];
- const int start_x = MultiplyBy4(column_64x64) >> subsampling_x;
- const int start_y = MultiplyBy4(row_64x64) >> subsampling_y;
- const int plane_width = RightShiftWithRounding(width_, subsampling_x);
- const int plane_height = RightShiftWithRounding(height_, subsampling_y);
- const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
- const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
- // unit_width, unit_height are the same as block_width, block_height unless
- // it reaches the frame boundary, where block_width < 64 or
- // block_height < 64. unit_width, unit_height guarantee we build blocks on
- // a multiple of 8.
- const int unit_width = Align(block_width, (subsampling_x > 0) ? 4 : 8);
- const int unit_height = Align(block_height, (subsampling_y > 0) ? 4 : 8);
- const bool is_frame_left = column_64x64 == 0;
- const bool is_frame_right = start_x + block_width >= plane_width;
- const bool is_frame_top = row_64x64 == 0;
- const bool is_frame_bottom = start_y + block_height >= plane_height;
+ int row4x4, int column4x4,
+ uint16_t* cdef_source, ptrdiff_t cdef_stride,
+ const bool y_plane) {
+ assert(y_plane || planes_ == kMaxPlanes);
+ const int max_planes = y_plane ? 1 : kMaxPlanes;
+ const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
+ const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
+ const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
+ const int plane_width = RightShiftWithRounding(width_, subsampling_x);
+ const int plane_height = RightShiftWithRounding(height_, subsampling_y);
+ const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
+ const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
+ // unit_width, unit_height are the same as block_width, block_height unless
+ // it reaches the frame boundary, where block_width < 64 or
+ // block_height < 64. unit_width, unit_height guarantee we build blocks on
+ // a multiple of 8.
+ const int unit_width = Align(block_width, 8 >> subsampling_x);
+ const int unit_height = Align(block_height, 8 >> subsampling_y);
+ const bool is_frame_left = column4x4 == 0;
+ const bool is_frame_right = start_x + block_width >= plane_width;
+ const bool is_frame_top = row4x4 == 0;
+ const bool is_frame_bottom = start_y + block_height >= plane_height;
+ const int y_offset = is_frame_top ? 0 : kCdefBorder;
+
+ for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
+ uint16_t* cdef_src = cdef_source + plane * kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders;
const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
const Pixel* src_buffer =
reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
- (start_y - (is_frame_top ? 0 : kCdefBorder)) * src_stride + start_x;
+ (start_y - y_offset) * src_stride + start_x;
// All the copying code will use negative indices for populating the left
// border. So the starting point is set to kCdefBorder.
cdef_src += kCdefBorder;
// Copy the top 2 rows.
- for (int y = 0; y < kCdefBorder; ++y) {
- if (is_frame_top) {
+ if (is_frame_top) {
+ for (int y = 0; y < kCdefBorder; ++y) {
Memset(cdef_src - kCdefBorder, kCdefLargeValue,
unit_width + 2 * kCdefBorder);
- } else {
+ cdef_src += cdef_stride;
+ }
+ } else {
+ for (int y = 0; y < kCdefBorder; ++y) {
CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
src_buffer += src_stride;
+ cdef_src += cdef_stride;
}
- cdef_src += cdef_stride;
}
// Copy the body.
- for (int y = 0; y < block_height; ++y) {
+ int y = block_height;
+ do {
CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
cdef_src += cdef_stride;
src_buffer += src_stride;
- }
+ } while (--y != 0);
// Copy the bottom 2 rows.
- for (int y = 0; y < kCdefBorder + unit_height - block_height; ++y) {
- if (is_frame_bottom) {
+ if (is_frame_bottom) {
+ do {
Memset(cdef_src - kCdefBorder, kCdefLargeValue,
unit_width + 2 * kCdefBorder);
- } else {
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
+ } else {
+ do {
CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
src_buffer += src_stride;
- }
- cdef_src += cdef_stride;
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
}
}
}
@@ -156,130 +179,237 @@ void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
const int block_height4x4,
const int row4x4_start,
const int column4x4_start) {
- const int coeff_shift = bitdepth_ - 8;
- const int step = kNum4x4BlocksWide[kBlock8x8];
+ // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
+ static constexpr int kStep = 8;
+ static constexpr int kStep4x4 = 2;
+
const int window_buffer_plane_size =
window_buffer_width_ * window_buffer_height_ * pixel_size_;
+ int cdef_buffer_row_base_stride[kMaxPlanes];
+ int cdef_buffer_stride[kMaxPlanes];
+ uint8_t* cdef_buffer_row_base[kMaxPlanes];
+ int src_buffer_row_base_stride[kMaxPlanes];
+ const uint8_t* src_buffer_row_base[kMaxPlanes];
+ int column_step[kMaxPlanes];
+ for (int plane = kPlaneY; plane < planes_; ++plane) {
+ const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane];
+ const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane];
+ cdef_buffer_row_base[plane] = GetCdefBufferAndStride(
+ start_x, start_y, plane, window_buffer_plane_size,
+ &cdef_buffer_stride[plane]);
+ cdef_buffer_row_base_stride[plane] =
+ cdef_buffer_stride[plane] * (kStep >> subsampling_y_[plane]);
+ src_buffer_row_base[plane] = source_buffer_[plane] +
+ start_y * frame_buffer_.stride(plane) +
+ start_x * pixel_size_;
+ src_buffer_row_base_stride[plane] =
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ column_step[plane] = (kStep >> subsampling_x_[plane]) * pixel_size_;
+ }
if (index == -1) {
for (int plane = kPlaneY; plane < planes_; ++plane) {
- const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane];
- const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane];
- int cdef_stride;
- uint8_t* const cdef_buffer = GetCdefBufferAndStride(
- start_x, start_y, plane, subsampling_x_[plane], subsampling_y_[plane],
- window_buffer_plane_size, &cdef_stride);
- const int src_stride = frame_buffer_.stride(plane);
- uint8_t* const src_buffer =
- source_buffer_[plane] + start_y * src_stride + start_x * pixel_size_;
- const int block_width =
- MultiplyBy4(block_width4x4) >> subsampling_x_[plane];
- const int block_height =
- MultiplyBy4(block_height4x4) >> subsampling_y_[plane];
- for (int y = 0; y < block_height; ++y) {
- memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride,
- block_width * pixel_size_);
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ pixel_size_);
+ }
+ return;
+ }
+
+ PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
+ column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
+ true);
+
+  // Direction stored during the luma pass and reused during the u/v pass. If
+  // bit 3 is set, the block is a skip.
+ int direction_y[8 * 8];
+ int y_index = 0;
+
+ const uint8_t y_primary_strength =
+ frame_header_.cdef.y_primary_strength[index];
+ const uint8_t y_secondary_strength =
+ frame_header_.cdef.y_secondary_strength[index];
+
+ const bool compute_direction_and_variance =
+ (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
+ BlockParameters* const* bp_row0_base =
+ block_parameters_.Address(row4x4_start, column4x4_start);
+ BlockParameters* const* bp_row1_base =
+ bp_row0_base + block_parameters_.columns4x4();
+ const int bp_stride = MultiplyBy2(block_parameters_.columns4x4());
+ int row4x4 = row4x4_start;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
+ const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+ BlockParameters* const* bp0 = bp_row0_base;
+ BlockParameters* const* bp1 = bp_row1_base;
+ int column4x4 = column4x4_start;
+ do {
+ const int block_width = kStep;
+ const int block_height = kStep;
+ const int cdef_stride = cdef_buffer_stride[kPlaneY];
+ uint8_t* const cdef_buffer = cdef_buffer_base;
+ const int src_stride = frame_buffer_.stride(kPlaneY);
+ const uint8_t* const src_buffer = src_buffer_base;
+
+ const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip &&
+ (*(bp1 + 1))->skip;
+
+ if (skip) { // No cdef filtering.
+ direction_y[y_index] = kCdefSkip;
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, pixel_size_);
+ } else {
+ // Zero out residual skip flag.
+ direction_y[y_index] = 0;
+
+ int variance = 0;
+ if (compute_direction_and_variance) {
+ dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+ &variance);
+ }
+ const int direction =
+ (y_primary_strength == 0) ? 0 : direction_y[y_index];
+ const int variance_strength =
+ ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0;
+ const uint8_t primary_strength =
+ (variance != 0)
+ ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
+ : 0;
+
+ if ((primary_strength | y_secondary_strength) == 0) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, pixel_size_);
+ } else {
+ uint16_t* cdef_src = cdef_block + kPlaneY * kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders;
+ cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+ cdef_src +=
+ (MultiplyBy4(row4x4 - row4x4_start)) * kCdefUnitSizeWithBorders +
+ (MultiplyBy4(column4x4 - column4x4_start));
+ dsp_.cdef_filter(cdef_src, kCdefUnitSizeWithBorders, block_width,
+ block_height, primary_strength, y_secondary_strength,
+ frame_header_.cdef.damping, direction, cdef_buffer,
+ cdef_stride);
+ }
}
+ cdef_buffer_base += column_step[kPlaneY];
+ src_buffer_base += column_step[kPlaneY];
+
+ bp0 += kStep4x4;
+ bp1 += kStep4x4;
+ column4x4 += kStep4x4;
+ y_index++;
+ } while (column4x4 < column4x4_start + block_width4x4);
+
+ cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
+ src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+ bp_row0_base += bp_stride;
+ bp_row1_base += bp_stride;
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
+
+ if (planes_ == kMaxPlanesMonochrome) {
+ return;
+ }
+
+ const uint8_t uv_primary_strength =
+ frame_header_.cdef.uv_primary_strength[index];
+ const uint8_t uv_secondary_strength =
+ frame_header_.cdef.uv_secondary_strength[index];
+
+ if ((uv_primary_strength | uv_secondary_strength) == 0) {
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ pixel_size_);
}
return;
}
PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
- column4x4_start, cdef_block,
- kRestorationProcessingUnitSizeWithBorders);
-
- for (int row4x4 = row4x4_start; row4x4 < row4x4_start + block_height4x4;
- row4x4 += step) {
- for (int column4x4 = column4x4_start;
- column4x4 < column4x4_start + block_width4x4; column4x4 += step) {
- const bool skip =
- block_parameters_.Find(row4x4, column4x4) != nullptr &&
- block_parameters_.Find(row4x4 + 1, column4x4) != nullptr &&
- block_parameters_.Find(row4x4, column4x4 + 1) != nullptr &&
- block_parameters_.Find(row4x4 + 1, column4x4 + 1) != nullptr &&
- block_parameters_.Find(row4x4, column4x4)->skip &&
- block_parameters_.Find(row4x4 + 1, column4x4)->skip &&
- block_parameters_.Find(row4x4, column4x4 + 1)->skip &&
- block_parameters_.Find(row4x4 + 1, column4x4 + 1)->skip;
- int damping = frame_header_.cdef.damping + coeff_shift;
- int direction_y;
- int direction;
- int variance;
- uint8_t primary_strength;
- uint8_t secondary_strength;
+ column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
+ false);
- for (int plane = kPlaneY; plane < planes_; ++plane) {
- const int8_t subsampling_x = subsampling_x_[plane];
- const int8_t subsampling_y = subsampling_y_[plane];
- const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
- const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
- const int block_width = 8 >> subsampling_x;
- const int block_height = 8 >> subsampling_y;
- int cdef_stride;
- uint8_t* const cdef_buffer = GetCdefBufferAndStride(
- start_x, start_y, plane, subsampling_x, subsampling_y,
- window_buffer_plane_size, &cdef_stride);
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ const int8_t subsampling_x = subsampling_x_[plane];
+ const int8_t subsampling_y = subsampling_y_[plane];
+ const int block_width = kStep >> subsampling_x;
+ const int block_height = kStep >> subsampling_y;
+ int row4x4 = row4x4_start;
+
+ y_index = 0;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
+ const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+ int column4x4 = column4x4_start;
+ do {
+ const int cdef_stride = cdef_buffer_stride[plane];
+ uint8_t* const cdef_buffer = cdef_buffer_base;
const int src_stride = frame_buffer_.stride(plane);
- uint8_t* const src_buffer = source_buffer_[plane] +
- start_y * src_stride +
- start_x * pixel_size_;
+ const uint8_t* const src_buffer = src_buffer_base;
+ const bool skip = direction_y[y_index] & kCdefSkip;
+ int dual_cdef = 0;
if (skip) { // No cdef filtering.
- for (int y = 0; y < block_height; ++y) {
- memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride,
- block_width * pixel_size_);
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, pixel_size_);
+ } else {
+ // Make sure block pair is not out of bounds.
+ if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
+ // Enable dual processing if subsampling_x is 1.
+ dual_cdef = subsampling_x;
}
- continue;
- }
- if (plane == kPlaneY) {
- dsp_.cdef_direction(src_buffer, src_stride, &direction_y, &variance);
- primary_strength = frame_header_.cdef.y_primary_strength[index]
- << coeff_shift;
- secondary_strength = frame_header_.cdef.y_secondary_strength[index]
- << coeff_shift;
- direction = (primary_strength == 0) ? 0 : direction_y;
- const int variance_strength =
- ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
- : 0;
- primary_strength =
- (variance != 0)
- ? (primary_strength * (4 + variance_strength) + 8) >> 4
- : 0;
- } else {
- primary_strength = frame_header_.cdef.uv_primary_strength[index]
- << coeff_shift;
- secondary_strength = frame_header_.cdef.uv_secondary_strength[index]
- << coeff_shift;
- direction =
- (primary_strength == 0)
- ? 0
- : kCdefUvDirection[subsampling_x][subsampling_y][direction_y];
- damping = frame_header_.cdef.damping + coeff_shift - 1;
- }
+ int direction = (uv_primary_strength == 0)
+ ? 0
+ : kCdefUvDirection[subsampling_x][subsampling_y]
+ [direction_y[y_index]];
+
+ if (dual_cdef != 0) {
+ if (uv_primary_strength &&
+ direction_y[y_index] != direction_y[y_index + 1]) {
+ // Disable dual processing if the second block of the pair does
+ // not have the same direction.
+ dual_cdef = 0;
+ }
- if ((primary_strength | secondary_strength) == 0) {
- for (int y = 0; y < block_height; ++y) {
- memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride,
- block_width * pixel_size_);
+ // Disable dual processing if the second block of the pair is a
+ // skip.
+ if (direction_y[y_index + 1] == kCdefSkip) {
+ dual_cdef = 0;
+ }
}
- continue;
+
+ uint16_t* cdef_src = cdef_block + plane * kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders;
+ cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+ cdef_src +=
+ (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) *
+ kCdefUnitSizeWithBorders +
+ (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x);
+ dsp_.cdef_filter(cdef_src, kCdefUnitSizeWithBorders,
+ block_width << dual_cdef, block_height,
+ uv_primary_strength, uv_secondary_strength,
+ frame_header_.cdef.damping - 1, direction,
+ cdef_buffer, cdef_stride);
}
- uint16_t* cdef_src =
- cdef_block + plane * kRestorationProcessingUnitSizeWithBorders *
- kRestorationProcessingUnitSizeWithBorders;
- cdef_src += kCdefBorder * kRestorationProcessingUnitSizeWithBorders +
- kCdefBorder;
- cdef_src += (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) *
- kRestorationProcessingUnitSizeWithBorders +
- (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x);
- dsp_.cdef_filter(cdef_src, kRestorationProcessingUnitSizeWithBorders,
- frame_header_.rows4x4, frame_header_.columns4x4,
- start_x, start_y, subsampling_x, subsampling_y,
- primary_strength, secondary_strength, damping,
- direction, cdef_buffer, cdef_stride);
- }
- }
+ // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
+ // so adjust the pointers and indexes for 2 blocks.
+ cdef_buffer_base += column_step[plane] << dual_cdef;
+ src_buffer_base += column_step[plane] << dual_cdef;
+ column4x4 += kStep4x4 << dual_cdef;
+ y_index += 1 << dual_cdef;
+ } while (column4x4 < column4x4_start + block_width4x4);
+
+ cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
+ src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
}
}
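
The luma pass above scales the nominal primary strength by the block variance
before filtering. A small standalone sketch of that computation; ScaledPrimaryStrength
and the local FloorLog2 are hypothetical helpers written out only for this example:

#include <algorithm>

// Minimal FloorLog2 for the example (n > 0).
inline int FloorLog2(int n) {
  int result = -1;
  while (n != 0) {
    n >>= 1;
    ++result;
  }
  return result;
}

// Mirrors the strength scaling in ApplyCdefForOneUnit() above.
inline int ScaledPrimaryStrength(int y_primary_strength, int variance) {
  const int variance_strength =
      ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0;
  return (variance != 0)
             ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
             : 0;
}

// ScaledPrimaryStrength(8, 4096) == 5 while ScaledPrimaryStrength(8, 1 << 18)
// == 8: low-variance blocks are filtered more gently, and only high-variance
// blocks keep the full strength signaled in the frame header.
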
@@ -336,8 +466,7 @@ void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
template <typename Pixel>
void PostFilter::ApplyCdefForOneRowInWindow(const int row4x4,
const int column4x4_start) {
- uint16_t cdef_block[kRestorationProcessingUnitSizeWithBorders *
- kRestorationProcessingUnitSizeWithBorders * 3];
+ uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3];
for (int column4x4_64x64 = 0;
column4x4_64x64 < std::min(DivideBy4(window_buffer_width_),
diff --git a/chromium/third_party/libgav1/src/src/post_filter/deblock.cc b/chromium/third_party/libgav1/src/src/post_filter/deblock.cc
index db21d3db117..afe2895dbe3 100644
--- a/chromium/third_party/libgav1/src/src/post_filter/deblock.cc
+++ b/chromium/third_party/libgav1/src/src/post_filter/deblock.cc
@@ -17,6 +17,76 @@
#include "src/utils/blocking_counter.h"
namespace libgav1 {
+namespace {
+
+constexpr uint8_t HevThresh(int level) { return DivideBy16(level); }
+
+// GetLoopFilterSize* functions depend on this exact ordering of the
+// LoopFilterSize enums.
+static_assert(dsp::kLoopFilterSize4 == 0, "");
+static_assert(dsp::kLoopFilterSize6 == 1, "");
+static_assert(dsp::kLoopFilterSize8 == 2, "");
+static_assert(dsp::kLoopFilterSize14 == 3, "");
+
+dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) {
+ // |filter_length| must be a power of 2.
+ assert((filter_length & (filter_length - 1)) == 0);
+  // This code is the branch-free equivalent of:
+ // if (filter_length == 4) return kLoopFilterSize4;
+ // if (filter_length == 8) return kLoopFilterSize8;
+ // return kLoopFilterSize14;
+ return static_cast<dsp::LoopFilterSize>(
+ MultiplyBy2(static_cast<int>(filter_length > 4)) +
+ static_cast<int>(filter_length > 8));
+}
+
+constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) {
+ // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4,
+ // otherwise size is kLoopFilterSize6.
+ return static_cast<dsp::LoopFilterSize>(filter_length != 4);
+}
+
+// 7.14.5.
+void ComputeDeblockFilterLevelsHelper(
+ const ObuFrameHeader& frame_header, int segment_id, int level_index,
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
+ const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
+ uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0,
+ kMaxLoopFilterValue);
+ const auto feature = static_cast<SegmentFeature>(
+ kSegmentFeatureLoopFilterYVertical + level_index);
+ level =
+ Clip3(level + frame_header.segmentation.feature_data[segment_id][feature],
+ 0, kMaxLoopFilterValue);
+ if (!frame_header.loop_filter.delta_enabled) {
+ static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
+ memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
+ return;
+ }
+ assert(frame_header.loop_filter.delta_enabled);
+ const int shift = level >> 5;
+ deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
+ shift),
+ 0, kMaxLoopFilterValue);
+ // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does
+ // not have to be populated.
+ for (int reference_frame = kReferenceFrameIntra + 1;
+ reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
+ for (int mode_id = 0; mode_id < 2; ++mode_id) {
+ deblock_filter_levels[reference_frame][mode_id] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
+ frame_header.loop_filter.mode_deltas[mode_id],
+ shift),
+ 0, kMaxLoopFilterValue);
+ }
+ }
+}
+
+} // namespace
void PostFilter::ComputeDeblockFilterLevels(
const int8_t delta_lf[kFrameLfCount],
@@ -28,13 +98,13 @@ void PostFilter::ComputeDeblockFilterLevels(
++segment_id) {
int level_index = 0;
for (; level_index < 2; ++level_index) {
- LoopFilterMask::ComputeDeblockFilterLevels(
+ ComputeDeblockFilterLevelsHelper(
frame_header_, segment_id, level_index, delta_lf,
deblock_filter_levels[segment_id][level_index]);
}
for (; level_index < kFrameLfCount; ++level_index) {
if (frame_header_.loop_filter.level[level_index] != 0) {
- LoopFilterMask::ComputeDeblockFilterLevels(
+ ComputeDeblockFilterLevelsHelper(
frame_header_, segment_id, level_index, delta_lf,
deblock_filter_levels[segment_id][level_index]);
}
@@ -42,62 +112,28 @@ void PostFilter::ComputeDeblockFilterLevels(
}
}
-void PostFilter::InitDeblockFilterParams() {
- const int8_t sharpness = frame_header_.loop_filter.sharpness;
- assert(0 <= sharpness && sharpness < 8);
- const int shift = DivideBy4(sharpness + 3); // ceil(sharpness / 4.0)
- for (int level = 0; level <= kMaxLoopFilterValue; ++level) {
- uint8_t limit = level >> shift;
- if (sharpness > 0) {
- limit = Clip3(limit, 1, 9 - sharpness);
- } else {
- limit = std::max(limit, static_cast<uint8_t>(1));
- }
- inner_thresh_[level] = limit;
- outer_thresh_[level] = 2 * (level + 2) + limit;
- hev_thresh_[level] = level >> 4;
- }
-}
-
-void PostFilter::GetDeblockFilterParams(uint8_t level, int* outer_thresh,
- int* inner_thresh,
- int* hev_thresh) const {
- *outer_thresh = outer_thresh_[level];
- *inner_thresh = inner_thresh_[level];
- *hev_thresh = hev_thresh_[level];
-}
-
-template <LoopFilterType type>
-bool PostFilter::GetDeblockFilterEdgeInfo(const Plane plane, int row4x4,
- int column4x4,
- const int8_t subsampling_x,
- const int8_t subsampling_y,
- uint8_t* level, int* step,
- int* filter_length) const {
+bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(const Plane plane,
+ int row4x4, int column4x4,
+ const int8_t subsampling_x,
+ const int8_t subsampling_y,
+ uint8_t* level, int* step,
+ int* filter_length) const {
row4x4 = GetDeblockPosition(row4x4, subsampling_y);
column4x4 = GetDeblockPosition(column4x4, subsampling_x);
const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
const TransformSize transform_size =
(plane == kPlaneY) ? inter_transform_sizes_[row4x4][column4x4]
: bp->uv_transform_size;
- *step = (type == kLoopFilterTypeHorizontal) ? kTransformHeight[transform_size]
- : kTransformWidth[transform_size];
- if ((type == kLoopFilterTypeHorizontal && row4x4 == subsampling_y) ||
- (type == kLoopFilterTypeVertical && column4x4 == subsampling_x)) {
- return false;
- }
+ *step = kTransformHeight[transform_size];
+ if (row4x4 == subsampling_y) return false;
- const int filter_id = kDeblockFilterLevelIndex[plane][type];
+ const int filter_id =
+ kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal];
const uint8_t level_this = bp->deblock_filter_level[filter_id];
- const int row4x4_prev = (type == kLoopFilterTypeHorizontal)
- ? row4x4 - (1 << subsampling_y)
- : row4x4;
- const int column4x4_prev = (type == kLoopFilterTypeHorizontal)
- ? column4x4
- : column4x4 - (1 << subsampling_x);
- assert(row4x4_prev >= 0 && column4x4_prev >= 0);
+ const int row4x4_prev = row4x4 - (1 << subsampling_y);
+ assert(row4x4_prev >= 0);
const BlockParameters* bp_prev =
- block_parameters_.Find(row4x4_prev, column4x4_prev);
+ block_parameters_.Find(row4x4_prev, column4x4);
const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id];
*level = level_this;
if (level_this == 0) {
@@ -107,373 +143,91 @@ bool PostFilter::GetDeblockFilterEdgeInfo(const Plane plane, int row4x4,
const BlockSize size =
kPlaneResidualSize[bp->size][subsampling_x][subsampling_y];
- const int prediction_masks = (type == kLoopFilterTypeHorizontal)
- ? kBlockHeightPixels[size] - 1
- : kBlockWidthPixels[size] - 1;
- const int pixel_position = MultiplyBy4((type == kLoopFilterTypeHorizontal)
- ? row4x4 >> subsampling_y
- : column4x4 >> subsampling_x);
+ const int prediction_masks = kBlockHeightPixels[size] - 1;
+ const int pixel_position = MultiplyBy4(row4x4 >> subsampling_y);
const bool is_border = (pixel_position & prediction_masks) == 0;
const bool skip = bp->skip && bp->is_inter;
const bool skip_prev = bp_prev->skip && bp_prev->is_inter;
if (!skip || !skip_prev || is_border) {
const TransformSize transform_size_prev =
- (plane == kPlaneY) ? inter_transform_sizes_[row4x4_prev][column4x4_prev]
+ (plane == kPlaneY) ? inter_transform_sizes_[row4x4_prev][column4x4]
: bp_prev->uv_transform_size;
- const int step_prev = (type == kLoopFilterTypeHorizontal)
- ? kTransformHeight[transform_size_prev]
- : kTransformWidth[transform_size_prev];
+ const int step_prev = kTransformHeight[transform_size_prev];
*filter_length = std::min(*step, step_prev);
return true;
}
return false;
}
-void PostFilter::HorizontalDeblockFilter(Plane plane, int row4x4_start,
- int column4x4_start, int unit_id) {
- const int8_t subsampling_x = subsampling_x_[plane];
- const int8_t subsampling_y = subsampling_y_[plane];
- const int row_step = 1 << subsampling_y;
- const int column_step = 1 << subsampling_x;
- const size_t src_step = 4 * pixel_size_;
- const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(plane));
- const ptrdiff_t src_stride = frame_buffer_.stride(plane);
- uint8_t* src = GetSourceBuffer(plane, row4x4_start, column4x4_start);
- const uint64_t single_row_mask = 0xffff;
- // 3 (11), 5 (0101).
- const uint64_t two_block_mask = (subsampling_x > 0) ? 5 : 3;
- const LoopFilterType type = kLoopFilterTypeHorizontal;
- // Subsampled UV samples correspond to the right/bottom position of
- // Y samples.
- const int column = subsampling_x;
-
- // AV1 smallest transform size is 4x4, thus minimum horizontal edge size is
- // 4x4. For SIMD implementation, sse2 could compute 8 pixels at the same time.
- // __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time.
- // __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could
- // filter 2 horizontal edges using sse2 and 4 edges using AVX2.
- // The bitmask enables us to call different SIMD implementations to filter
- // 1 edge, or 2 edges or 4 edges.
- // TODO(chengchen): Here, the implementation only consider 1 and 2 edges.
- // Add support for 4 edges. More branches involved, for example, if input is
- // 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using
- // sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then
- // we apply filtering for 2 edges using sse2, and 4 edges using AVX2.
- for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ &&
- row4x4 < kNum4x4InLoopFilterMaskUnit;
- row4x4 += row_step) {
- if (row4x4_start + row4x4 == 0) {
- src += row_stride;
- continue;
- }
- // Subsampled UV samples correspond to the right/bottom position of
- // Y samples.
- const int row = GetDeblockPosition(row4x4, subsampling_y);
- const int index = GetIndex(row);
- const int shift = GetShift(row, column);
- const int level_offset = LoopFilterMask::GetLevelOffset(row, column);
- // Mask of current row. mask4x4 represents the vertical filter length for
- // the current horizontal edge is 4, and we needs to apply 3-tap filtering.
- // Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16.
- uint64_t mask4x4 =
- (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId4x4, index) >>
- shift) &
- single_row_mask;
- uint64_t mask8x8 =
- (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId8x8, index) >>
- shift) &
- single_row_mask;
- uint64_t mask16x16 =
- (masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId16x16,
- index) >>
- shift) &
- single_row_mask;
- // mask4x4, mask8x8, mask16x16 are mutually exclusive.
- assert((mask4x4 & mask8x8) == 0 && (mask4x4 & mask16x16) == 0 &&
- (mask8x8 & mask16x16) == 0);
- // Apply deblock filter for one row.
- uint8_t* src_row = src;
- int column_offset = 0;
- for (uint64_t mask = mask4x4 | mask8x8 | mask16x16; mask != 0;) {
- int edge_count = 1;
- if ((mask & 1) != 0) {
- // Filter parameters of current edge.
- const uint8_t level = masks_->GetLevel(unit_id, plane, type,
- level_offset + column_offset);
- int outer_thresh_0;
- int inner_thresh_0;
- int hev_thresh_0;
- GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0,
- &hev_thresh_0);
- // Filter parameters of next edge. Clip the index to avoid over
- // reading at the edge of the block. The values will be unused in that
- // case.
- const int level_next_index = level_offset + column_offset + column_step;
- const uint8_t level_next =
- masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff);
- int outer_thresh_1;
- int inner_thresh_1;
- int hev_thresh_1;
- GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1,
- &hev_thresh_1);
-
- if ((mask16x16 & 1) != 0) {
- const dsp::LoopFilterSize size = (plane == kPlaneY)
- ? dsp::kLoopFilterSize14
- : dsp::kLoopFilterSize6;
- const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- if ((mask16x16 & two_block_mask) == two_block_mask) {
- edge_count = 2;
- // Apply filtering for two edges.
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- filter_func(src_row + src_step, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- } else {
- // Apply single edge filtering.
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- }
- }
-
- if ((mask8x8 & 1) != 0) {
- const dsp::LoopFilterSize size =
- plane == kPlaneY ? dsp::kLoopFilterSize8 : dsp::kLoopFilterSize6;
- const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- if ((mask8x8 & two_block_mask) == two_block_mask) {
- edge_count = 2;
- // Apply filtering for two edges.
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- filter_func(src_row + src_step, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- } else {
- // Apply single edge filtering.
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- }
- }
+bool PostFilter::GetVerticalDeblockFilterEdgeInfo(
+ const Plane /*plane*/, int row4x4, int column4x4,
+ const int8_t /*subsampling_x*/, const int8_t /*subsampling_y*/,
+ BlockParameters* const* bp_ptr, uint8_t* level, int* step,
+ int* filter_length) const {
+ const BlockParameters* bp = *bp_ptr;
+ *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]];
+ if (column4x4 == 0) return false;
- if ((mask4x4 & 1) != 0) {
- const dsp::LoopFilterSize size = dsp::kLoopFilterSize4;
- const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- if ((mask4x4 & two_block_mask) == two_block_mask) {
- edge_count = 2;
- // Apply filtering for two edges.
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- filter_func(src_row + src_step, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- } else {
- // Apply single edge filtering.
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- }
- }
- }
-
- const int step = edge_count * column_step;
- mask4x4 >>= step;
- mask8x8 >>= step;
- mask16x16 >>= step;
- mask >>= step;
- column_offset += step;
- src_row += MultiplyBy4(edge_count) * pixel_size_;
- }
- src += row_stride;
+ const int filter_id = 0;
+ const uint8_t level_this = bp->deblock_filter_level[filter_id];
+ const int column4x4_prev = column4x4 - 1;
+ assert(column4x4_prev >= 0);
+ const BlockParameters* bp_prev = *(bp_ptr - 1);
+ const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id];
+ *level = level_this;
+ if (level_this == 0) {
+ if (level_prev == 0) return false;
+ *level = level_prev;
}
-}
-void PostFilter::VerticalDeblockFilter(Plane plane, int row4x4_start,
- int column4x4_start, int unit_id) {
- const int8_t subsampling_x = subsampling_x_[plane];
- const int8_t subsampling_y = subsampling_y_[plane];
- const int row_step = 1 << subsampling_y;
- const int two_row_step = row_step << 1;
- const int column_step = 1 << subsampling_x;
- const size_t src_step = (bitdepth_ == 8) ? 4 : 4 * sizeof(uint16_t);
- const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(plane));
- const ptrdiff_t two_row_stride = row_stride << 1;
- const ptrdiff_t src_stride = frame_buffer_.stride(plane);
- uint8_t* src = GetSourceBuffer(plane, row4x4_start, column4x4_start);
- const uint64_t single_row_mask = 0xffff;
- const LoopFilterType type = kLoopFilterTypeVertical;
- // Subsampled UV samples correspond to the right/bottom position of
- // Y samples.
- const int column = subsampling_x;
-
- // AV1 smallest transform size is 4x4, thus minimum vertical edge size is 4x4.
- // For SIMD implementation, sse2 could compute 8 pixels at the same time.
- // __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time.
- // __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could
- // filter 2 vertical edges using sse2 and 4 edges using AVX2.
- // The bitmask enables us to call different SIMD implementations to filter
- // 1 edge, or 2 edges or 4 edges.
- // TODO(chengchen): Here, the implementation only consider 1 and 2 edges.
- // Add support for 4 edges. More branches involved, for example, if input is
- // 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using
- // sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then
- // we apply filtering for 2 edges using sse2, and 4 edges using AVX2.
- for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ &&
- row4x4 < kNum4x4InLoopFilterMaskUnit;
- row4x4 += two_row_step) {
- // Subsampled UV samples correspond to the right/bottom position of
- // Y samples.
- const int row = GetDeblockPosition(row4x4, subsampling_y);
- const int row_next = row + row_step;
- const int index = GetIndex(row);
- const int shift = GetShift(row, column);
- const int level_offset = LoopFilterMask::GetLevelOffset(row, column);
- const int index_next = GetIndex(row_next);
- const int shift_next_row = GetShift(row_next, column);
- const int level_offset_next_row =
- LoopFilterMask::GetLevelOffset(row_next, column);
- // TODO(chengchen): replace 0, 1, 2 to meaningful enum names.
- // mask of current row. mask4x4 represents the horizontal filter length for
- // the current vertical edge is 4, and we needs to apply 3-tap filtering.
- // Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16.
- uint64_t mask4x4_0 =
- (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4,
- index) >>
- shift) &
- single_row_mask;
- uint64_t mask8x8_0 =
- (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8,
- index) >>
- shift) &
- single_row_mask;
- uint64_t mask16x16_0 =
- (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16,
- index) >>
- shift) &
- single_row_mask;
- // mask4x4, mask8x8, mask16x16 are mutually exclusive.
- assert((mask4x4_0 & mask8x8_0) == 0 && (mask4x4_0 & mask16x16_0) == 0 &&
- (mask8x8_0 & mask16x16_0) == 0);
- // mask of the next row. With mask of current and the next row, we can call
- // the corresponding SIMD function to apply filtering for two vertical
- // edges together.
- uint64_t mask4x4_1 =
- (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4,
- index_next) >>
- shift_next_row) &
- single_row_mask;
- uint64_t mask8x8_1 =
- (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8,
- index_next) >>
- shift_next_row) &
- single_row_mask;
- uint64_t mask16x16_1 =
- (masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16,
- index_next) >>
- shift_next_row) &
- single_row_mask;
- // mask4x4, mask8x8, mask16x16 are mutually exclusive.
- assert((mask4x4_1 & mask8x8_1) == 0 && (mask4x4_1 & mask16x16_1) == 0 &&
- (mask8x8_1 & mask16x16_1) == 0);
- // Apply deblock filter for two rows.
- uint8_t* src_row = src;
- int column_offset = 0;
- for (uint64_t mask = mask4x4_0 | mask8x8_0 | mask16x16_0 | mask4x4_1 |
- mask8x8_1 | mask16x16_1;
- mask != 0;) {
- if ((mask & 1) != 0) {
- // Filter parameters of current row.
- const uint8_t level = masks_->GetLevel(unit_id, plane, type,
- level_offset + column_offset);
- int outer_thresh_0;
- int inner_thresh_0;
- int hev_thresh_0;
- GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0,
- &hev_thresh_0);
- // Filter parameters of next row. Clip the index to avoid over
- // reading at the edge of the block. The values will be unused in that
- // case.
- const int level_next_index = level_offset_next_row + column_offset;
- const uint8_t level_next =
- masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff);
- int outer_thresh_1;
- int inner_thresh_1;
- int hev_thresh_1;
- GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1,
- &hev_thresh_1);
- uint8_t* const src_row_next = src_row + row_stride;
-
- if (((mask16x16_0 | mask16x16_1) & 1) != 0) {
- const dsp::LoopFilterSize size = (plane == kPlaneY)
- ? dsp::kLoopFilterSize14
- : dsp::kLoopFilterSize6;
- const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- if ((mask16x16_0 & mask16x16_1 & 1) != 0) {
- // Apply dual vertical edge filtering.
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- filter_func(src_row_next, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- } else if ((mask16x16_0 & 1) != 0) {
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- } else {
- filter_func(src_row_next, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- }
- }
-
- if (((mask8x8_0 | mask8x8_1) & 1) != 0) {
- const dsp::LoopFilterSize size = (plane == kPlaneY)
- ? dsp::kLoopFilterSize8
- : dsp::kLoopFilterSize6;
- const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- if ((mask8x8_0 & mask8x8_1 & 1) != 0) {
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- filter_func(src_row_next, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- } else if ((mask8x8_0 & 1) != 0) {
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- } else {
- filter_func(src_row_next, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- }
- }
+ const int prediction_masks = kBlockWidthPixels[bp->size] - 1;
+ const int pixel_position = MultiplyBy4(column4x4);
+ const bool is_border = (pixel_position & prediction_masks) == 0;
+ const bool skip = bp->skip && bp->is_inter;
+ const bool skip_prev = bp_prev->skip && bp_prev->is_inter;
+ if (skip && skip_prev && !is_border) return false;
+ const int step_prev =
+ kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]];
+ *filter_length = std::min(*step, step_prev);
+ return true;
+}
- if (((mask4x4_0 | mask4x4_1) & 1) != 0) {
- const dsp::LoopFilterSize size = dsp::kLoopFilterSize4;
- const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- if ((mask4x4_0 & mask4x4_1 & 1) != 0) {
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- filter_func(src_row_next, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- } else if ((mask4x4_0 & 1) != 0) {
- filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
- hev_thresh_0);
- } else {
- filter_func(src_row_next, src_stride, outer_thresh_1,
- inner_thresh_1, hev_thresh_1);
- }
- }
- }
+bool PostFilter::GetVerticalDeblockFilterEdgeInfoUV(
+ const Plane plane, int row4x4, int column4x4, const int8_t subsampling_x,
+ const int8_t subsampling_y, BlockParameters* const* bp_ptr, uint8_t* level,
+ int* step, int* filter_length) const {
+ row4x4 = GetDeblockPosition(row4x4, subsampling_y);
+ column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+ const BlockParameters* bp = *bp_ptr;
+ *step = kTransformWidth[bp->uv_transform_size];
+ if (column4x4 == subsampling_x) return false;
- mask4x4_0 >>= column_step;
- mask8x8_0 >>= column_step;
- mask16x16_0 >>= column_step;
- mask4x4_1 >>= column_step;
- mask8x8_1 >>= column_step;
- mask16x16_1 >>= column_step;
- mask >>= column_step;
- column_offset += column_step;
- src_row += src_step;
- }
- src += two_row_stride;
+ const int filter_id =
+ kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical];
+ const uint8_t level_this = bp->deblock_filter_level[filter_id];
+ const BlockParameters* bp_prev = *(bp_ptr - (1 << subsampling_x));
+ const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id];
+ *level = level_this;
+ if (level_this == 0) {
+ if (level_prev == 0) return false;
+ *level = level_prev;
}
+
+ const BlockSize size =
+ kPlaneResidualSize[bp->size][subsampling_x][subsampling_y];
+ const int prediction_masks = kBlockWidthPixels[size] - 1;
+ const int pixel_position = MultiplyBy4(column4x4 >> subsampling_x);
+ const bool is_border = (pixel_position & prediction_masks) == 0;
+ const bool skip = bp->skip && bp->is_inter;
+ const bool skip_prev = bp_prev->skip && bp_prev->is_inter;
+ if (skip && skip_prev && !is_border) return false;
+ const int step_prev = kTransformWidth[bp_prev->uv_transform_size];
+ *filter_length = std::min(*step, step_prev);
+ return true;
}
-void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start,
- int column4x4_start,
- int unit_id) {
- static_cast<void>(unit_id);
+void PostFilter::HorizontalDeblockFilter(Plane plane, int row4x4_start,
+ int column4x4_start) {
const int8_t subsampling_x = subsampling_x_[plane];
const int8_t subsampling_y = subsampling_y_[plane];
const int column_step = 1 << subsampling_x;
@@ -486,27 +240,22 @@ void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start,
int filter_length;
for (int column4x4 = 0; MultiplyBy4(column4x4_start + column4x4) < width_ &&
- column4x4 < kNum4x4InLoopFilterMaskUnit;
+ column4x4 < kNum4x4InLoopFilterUnit;
column4x4 += column_step, src += src_step) {
uint8_t* src_row = src;
for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ &&
- row4x4 < kNum4x4InLoopFilterMaskUnit;
+ row4x4 < kNum4x4InLoopFilterUnit;
row4x4 += row_step) {
- const bool need_filter =
- GetDeblockFilterEdgeInfo<kLoopFilterTypeHorizontal>(
- plane, row4x4_start + row4x4, column4x4_start + column4x4,
- subsampling_x, subsampling_y, &level, &row_step, &filter_length);
+ const bool need_filter = GetHorizontalDeblockFilterEdgeInfo(
+ plane, row4x4_start + row4x4, column4x4_start + column4x4,
+ subsampling_x, subsampling_y, &level, &row_step, &filter_length);
if (need_filter) {
- int outer_thresh;
- int inner_thresh;
- int hev_thresh;
- GetDeblockFilterParams(level, &outer_thresh, &inner_thresh,
- &hev_thresh);
const dsp::LoopFilterSize size =
- GetLoopFilterSize(plane, filter_length);
+ (plane == kPlaneY) ? GetLoopFilterSizeY(filter_length)
+ : GetLoopFilterSizeUV(filter_length);
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- filter_func(src_row, src_stride, outer_thresh, inner_thresh,
- hev_thresh);
+ filter_func(src_row, src_stride, outer_thresh_[level],
+ inner_thresh_[level], HevThresh(level));
}
// TODO(chengchen): use shifts instead of multiplication.
src_row += row_step * src_stride;
@@ -515,9 +264,8 @@ void PostFilter::HorizontalDeblockFilterNoMask(Plane plane, int row4x4_start,
}
}
-void PostFilter::VerticalDeblockFilterNoMask(Plane plane, int row4x4_start,
- int column4x4_start, int unit_id) {
- static_cast<void>(unit_id);
+void PostFilter::VerticalDeblockFilter(Plane plane, int row4x4_start,
+ int column4x4_start) {
const int8_t subsampling_x = subsampling_x_[plane];
const int8_t subsampling_y = subsampling_y_[plane];
const int row_step = 1 << subsampling_y;
@@ -529,29 +277,30 @@ void PostFilter::VerticalDeblockFilterNoMask(Plane plane, int row4x4_start,
uint8_t level;
int filter_length;
+ BlockParameters* const* bp_row_base = block_parameters_.Address(
+ GetDeblockPosition(row4x4_start, subsampling_y),
+ GetDeblockPosition(column4x4_start, subsampling_x));
+ const auto edge_info = deblock_vertical_edge_info_[plane];
+ const int bp_stride = block_parameters_.columns4x4() * row_step;
for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ &&
- row4x4 < kNum4x4InLoopFilterMaskUnit;
- row4x4 += row_step, src += row_stride) {
+ row4x4 < kNum4x4InLoopFilterUnit;
+ row4x4 += row_step, src += row_stride, bp_row_base += bp_stride) {
uint8_t* src_row = src;
+ BlockParameters* const* bp = bp_row_base;
for (int column4x4 = 0; MultiplyBy4(column4x4_start + column4x4) < width_ &&
- column4x4 < kNum4x4InLoopFilterMaskUnit;
- column4x4 += column_step) {
- const bool need_filter =
- GetDeblockFilterEdgeInfo<kLoopFilterTypeVertical>(
- plane, row4x4_start + row4x4, column4x4_start + column4x4,
- subsampling_x, subsampling_y, &level, &column_step,
- &filter_length);
+ column4x4 < kNum4x4InLoopFilterUnit;
+ column4x4 += column_step, bp += column_step) {
+ const bool need_filter = (this->*edge_info)(
+ plane, row4x4_start + row4x4, column4x4_start + column4x4,
+ subsampling_x, subsampling_y, bp, &level, &column_step,
+ &filter_length);
if (need_filter) {
- int outer_thresh;
- int inner_thresh;
- int hev_thresh;
- GetDeblockFilterParams(level, &outer_thresh, &inner_thresh,
- &hev_thresh);
const dsp::LoopFilterSize size =
- GetLoopFilterSize(plane, filter_length);
+ (plane == kPlaneY) ? GetLoopFilterSizeY(filter_length)
+ : GetLoopFilterSizeUV(filter_length);
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
- filter_func(src_row, src_stride, outer_thresh, inner_thresh,
- hev_thresh);
+ filter_func(src_row, src_stride, outer_thresh_[level],
+ inner_thresh_[level], HevThresh(level));
}
src_row += column_step * pixel_size_;
column_step = DivideBy4(column_step << subsampling_x);
@@ -573,21 +322,19 @@ void PostFilter::ApplyDeblockFilterForOneSuperBlockRow(int row4x4_start,
if (row4x4 >= frame_header_.rows4x4) break;
int column4x4;
for (column4x4 = 0; column4x4 < frame_header_.columns4x4;
- column4x4 += kNum4x4InLoopFilterMaskUnit) {
+ column4x4 += kNum4x4InLoopFilterUnit) {
// First apply vertical filtering
- VerticalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4,
- column4x4, 0);
+ VerticalDeblockFilter(static_cast<Plane>(plane), row4x4, column4x4);
// Delay one superblock to apply horizontal filtering.
if (column4x4 != 0) {
- HorizontalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4,
- column4x4 - kNum4x4InLoopFilterMaskUnit,
- 0);
+ HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4,
+ column4x4 - kNum4x4InLoopFilterUnit);
}
}
// Horizontal filtering for the last 64x64 block.
- HorizontalDeblockFilterNoMask(static_cast<Plane>(plane), row4x4,
- column4x4 - kNum4x4InLoopFilterMaskUnit, 0);
+ HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4,
+ column4x4 - kNum4x4InLoopFilterUnit);
}
}
}
@@ -602,12 +349,11 @@ void PostFilter::DeblockFilterWorker(int jobs_per_plane, const Plane* planes,
total_jobs) {
const Plane plane = planes[job_index / jobs_per_plane];
const int row_unit = job_index % jobs_per_plane;
- const int row4x4 = row_unit * kNum4x4InLoopFilterMaskUnit;
+ const int row4x4 = row_unit * kNum4x4InLoopFilterUnit;
for (int column4x4 = 0, column_unit = 0;
column4x4 < frame_header_.columns4x4;
- column4x4 += kNum4x4InLoopFilterMaskUnit, ++column_unit) {
- const int unit_id = GetDeblockUnitId(row_unit, column_unit);
- (this->*deblock_filter)(plane, row4x4, column4x4, unit_id);
+ column4x4 += kNum4x4InLoopFilterUnit, ++column_unit) {
+ (this->*deblock_filter)(plane, row4x4, column4x4);
}
}
}
@@ -635,8 +381,7 @@ void PostFilter::ApplyDeblockFilterThreaded() {
// The only synchronization involved is to know when the each directional
// filter is complete for the entire frame.
for (auto& type : {kLoopFilterTypeVertical, kLoopFilterTypeHorizontal}) {
- const DeblockFilter deblock_filter =
- deblock_filter_type_table_[kDeblockFilterBitMask][type];
+ const DeblockFilter deblock_filter = deblock_filter_func_[type];
std::atomic<int> job_counter(0);
BlockingCounter pending_workers(num_workers);
for (int i = 0; i < num_workers; ++i) {
@@ -656,4 +401,31 @@ void PostFilter::ApplyDeblockFilterThreaded() {
}
}
+void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
+ int row4x4_start, int column4x4_start,
+ int column4x4_end, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoDeblock());
+
+ column4x4_end = std::min(column4x4_end, frame_header_.columns4x4);
+ if (column4x4_start >= column4x4_end) return;
+
+ const DeblockFilter deblock_filter = deblock_filter_func_[loop_filter_type];
+ const int sb_height4x4 =
+ std::min(sb4x4, frame_header_.rows4x4 - row4x4_start);
+ for (int plane = kPlaneY; plane < planes_; ++plane) {
+ if (plane != kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) {
+ continue;
+ }
+
+ for (int y = 0; y < sb_height4x4; y += kNum4x4InLoopFilterUnit) {
+ const int row4x4 = row4x4_start + y;
+ for (int column4x4 = column4x4_start; column4x4 < column4x4_end;
+ column4x4 += kNum4x4InLoopFilterUnit) {
+ (this->*deblock_filter)(static_cast<Plane>(plane), row4x4, column4x4);
+ }
+ }
+ }
+}
+
} // namespace libgav1
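
The branch-free arithmetic in GetLoopFilterSizeY() can be spot-checked against the
enum ordering asserted at the top of the file; a compile-time sketch (illustrative
only, not part of the patch):

// filter_length -> LoopFilterSize, per the static_asserts above.
static_assert(2 * (4 > 4) + (4 > 8) == 0, "4 -> kLoopFilterSize4");
static_assert(2 * (8 > 4) + (8 > 8) == 2, "8 -> kLoopFilterSize8");
static_assert(2 * (16 > 4) + (16 > 8) == 3, "16 -> kLoopFilterSize14");
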
diff --git a/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc b/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc
new file mode 100644
index 00000000000..ca12aaaeb7e
--- /dev/null
+++ b/chromium/third_party/libgav1/src/src/post_filter/deblock_thresholds.inc
@@ -0,0 +1,85 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Thresholds for the deblocking filter. Precomputed values of part of Section
+// 7.14.4 for all possible values of sharpness.
+
+constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = {
+ {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
+
+constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = {
+ {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40,
+ 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118,
+ 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157,
+ 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88,
+ 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114,
+ 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61,
+ 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87,
+ 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+ 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86,
+ 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33,
+ 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59,
+ 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85,
+ 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111,
+ 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58,
+ 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84,
+ 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110,
+ 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83,
+ 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109,
+ 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56,
+ 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82,
+ 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}};
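
These tables correspond to the per-level limits that the removed
PostFilter::InitDeblockFilterParams() derived at runtime; the filter calls in
deblock.cc now read them as outer_thresh_[level] and inner_thresh_[level], and the
removed hev_thresh_ computation (level >> 4) became the inline HevThresh(). A small
sketch that regenerates the rows from the removed formulas; the inline min/max
stands in for Clip3 and is an assumption for this example only:

#include <algorithm>
#include <cstdio>

int main() {
  constexpr int kMaxLoopFilterValue = 63;  // Matches the 64 entries per row.
  for (int sharpness = 0; sharpness < 8; ++sharpness) {
    const int shift = (sharpness + 3) >> 2;  // DivideBy4(sharpness + 3)
    for (int level = 0; level <= kMaxLoopFilterValue; ++level) {
      int limit = level >> shift;
      limit = (sharpness > 0)
                  ? std::min(std::max(limit, 1), 9 - sharpness)  // Clip3
                  : std::max(limit, 1);
      const int inner = limit;                    // kInnerThresh[sharpness][level]
      const int outer = 2 * (level + 2) + limit;  // kOuterThresh[sharpness][level]
      std::printf("sharpness %d level %2d: inner %2d outer %3d\n", sharpness,
                  level, inner, outer);
    }
  }
  return 0;
}
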
diff --git a/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc b/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc
index a36788057ba..b36ad80cf05 100644
--- a/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc
+++ b/chromium/third_party/libgav1/src/src/post_filter/loop_restoration.cc
@@ -21,16 +21,14 @@ void PostFilter::ApplyLoopRestorationForOneUnit(
uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride,
const Plane plane, const int plane_height, const int x, const int y,
const int row, const int column, const int unit_row,
- const int current_process_unit_height, const int plane_process_unit_width,
- const int plane_unit_size, const int num_horizontal_units,
- const int plane_width, Array2DView<Pixel>* const loop_restored_window) {
+ const int current_process_unit_height, const int plane_unit_size,
+ const int num_horizontal_units, const int plane_width,
+ Array2DView<Pixel>* const loop_restored_window) {
const int unit_x = x + column;
const int unit_y = y + row;
const int current_process_unit_width =
- (unit_x + plane_process_unit_width <= plane_width)
- ? plane_process_unit_width
- : plane_width - unit_x;
- uint8_t* cdef_unit_buffer =
+ std::min(plane_unit_size, plane_width - unit_x);
+ const uint8_t* cdef_unit_buffer =
cdef_buffer + unit_y * cdef_buffer_stride + unit_x * pixel_size_;
const int unit_column =
std::min(unit_x / plane_unit_size, num_horizontal_units - 1);
@@ -49,54 +47,47 @@ void PostFilter::ApplyLoopRestorationForOneUnit(
return;
}
+ const ptrdiff_t block_buffer_stride =
+ kRestorationUnitWidthWithBorders * sizeof(Pixel);
// The SIMD implementation of wiener filter (currently WienerFilter_SSE4_1())
// over-reads 6 bytes, so add 6 extra bytes at the end of block_buffer for 8
// bit.
- alignas(alignof(uint16_t))
- uint8_t block_buffer[kRestorationProcessingUnitSizeWithBorders *
- kRestorationProcessingUnitSizeWithBorders *
- sizeof(Pixel) +
- ((sizeof(Pixel) == 1) ? 6 : 0)];
- const ptrdiff_t block_buffer_stride =
- kRestorationProcessingUnitSizeWithBorders * pixel_size_;
- IntermediateBuffers intermediate_buffers;
-
- RestorationBuffer restoration_buffer = {
- {intermediate_buffers.box_filter.output[0],
- intermediate_buffers.box_filter.output[1]},
- plane_process_unit_width,
- {intermediate_buffers.box_filter.intermediate_a,
- intermediate_buffers.box_filter.intermediate_b},
- kRestorationProcessingUnitSizeWithBorders + kRestorationPadding,
- intermediate_buffers.wiener,
- kMaxSuperBlockSizeInPixels};
- const int deblock_buffer_units = 64 >> subsampling_y_[plane];
- uint8_t* const deblock_buffer = deblock_buffer_.data(plane);
- const int deblock_buffer_stride = deblock_buffer_.stride(plane);
- const int deblock_unit_y =
- std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0);
- uint8_t* deblock_unit_buffer =
- (deblock_buffer != nullptr)
- ? deblock_buffer + deblock_unit_y * deblock_buffer_stride +
- unit_x * pixel_size_
- : nullptr;
+ alignas(alignof(uint16_t)) uint8_t
+ block_buffer[kRestorationUnitHeightWithBorders * block_buffer_stride +
+ ((sizeof(Pixel) == 1) ? 6 : 0)];
+ RestorationBuffer restoration_buffer;
+ const uint8_t* source;
+ ptrdiff_t source_stride;
+ if (DoCdef()) {
+ const int deblock_buffer_units = 64 >> subsampling_y_[plane];
+ const uint8_t* const deblock_buffer = deblock_buffer_.data(plane);
+ assert(deblock_buffer != nullptr);
+ const int deblock_buffer_stride = deblock_buffer_.stride(plane);
+ const int deblock_unit_y =
+ std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0);
+ const uint8_t* const deblock_unit_buffer =
+ deblock_buffer + deblock_unit_y * deblock_buffer_stride +
+ unit_x * pixel_size_;
+ PrepareLoopRestorationBlock<Pixel>(
+ cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer,
+ deblock_buffer_stride, block_buffer, block_buffer_stride,
+ current_process_unit_width, current_process_unit_height, unit_y == 0,
+ unit_y + current_process_unit_height >= plane_height);
+ source = block_buffer + kRestorationBorder * block_buffer_stride +
+ kRestorationBorder * pixel_size_;
+ source_stride = kRestorationUnitWidthWithBorders;
+ } else {
+ source = cdef_unit_buffer;
+ source_stride = cdef_buffer_stride / sizeof(Pixel);
+ }
assert(type == kLoopRestorationTypeSgrProj ||
type == kLoopRestorationTypeWiener);
const dsp::LoopRestorationFunc restoration_func =
dsp_.loop_restorations[type - 2];
- PrepareLoopRestorationBlock<Pixel>(
- DoCdef(), cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer,
- deblock_buffer_stride, block_buffer, block_buffer_stride,
- current_process_unit_width, current_process_unit_height, unit_y == 0,
- unit_y + current_process_unit_height >= plane_height);
- restoration_func(reinterpret_cast<const uint8_t*>(
- block_buffer + kRestorationBorder * block_buffer_stride +
- kRestorationBorder * pixel_size_),
- &(*loop_restored_window)[row][column],
+ restoration_func(source, &(*loop_restored_window)[row][column],
restoration_info_->loop_restoration_info(
static_cast<Plane>(plane), unit_id),
- block_buffer_stride,
- loop_restored_window->columns() * pixel_size_,
+ source_stride, loop_restored_window->columns(),
current_process_unit_width, current_process_unit_height,
&restoration_buffer);
}
@@ -104,9 +95,8 @@ void PostFilter::ApplyLoopRestorationForOneUnit(
template <typename Pixel>
void PostFilter::ApplyLoopRestorationForSuperBlock(
const Plane plane, const int x, const int y, const int unit_row,
- const int current_process_unit_height, const int process_unit_width) {
+ const int current_process_unit_height, const int plane_unit_size) {
const int stride = frame_buffer_.stride(plane);
- const int plane_unit_size = loop_restoration_.unit_size[plane];
const int num_horizontal_units =
restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
const int plane_width =
@@ -119,23 +109,14 @@ void PostFilter::ApplyLoopRestorationForSuperBlock(
x * pixel_size_));
ApplyLoopRestorationForOneUnit<Pixel>(
superres_buffer_[plane], stride, plane, plane_height, x, y, 0, 0,
- unit_row, current_process_unit_height, process_unit_width,
- plane_unit_size, num_horizontal_units, plane_width,
- &loop_restored_window);
+ unit_row, current_process_unit_height, plane_unit_size,
+ num_horizontal_units, plane_width, &loop_restored_window);
}
void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start,
int sb4x4) {
assert(row4x4_start >= 0);
assert(DoRestoration());
- const int plane_process_unit_width[kMaxPlanes] = {
- kRestorationProcessingUnitSize,
- kRestorationProcessingUnitSize >> subsampling_x_[kPlaneU],
- kRestorationProcessingUnitSize >> subsampling_x_[kPlaneV]};
- const int plane_process_unit_height[kMaxPlanes] = {
- kRestorationProcessingUnitSize,
- kRestorationProcessingUnitSize >> subsampling_y_[kPlaneU],
- kRestorationProcessingUnitSize >> subsampling_y_[kPlaneV]};
for (int plane = 0; plane < planes_; ++plane) {
if (frame_header_.loop_restoration.type[plane] ==
kLoopRestorationTypeNone) {
@@ -149,36 +130,36 @@ void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start,
subsampling_x_[plane]);
const int num_vertical_units =
restoration_info_->num_vertical_units(static_cast<Plane>(plane));
- const int process_unit_width = plane_process_unit_width[plane];
+ const int plane_unit_size = frame_header_.loop_restoration.unit_size[plane];
+ const int plane_process_unit_height =
+ kRestorationUnitHeight >> subsampling_y_[plane];
+ int y = (row4x4_start == 0)
+ ? 0
+ : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) -
+ unit_height_offset;
+ int expected_height = plane_process_unit_height -
+ ((row4x4_start == 0) ? unit_height_offset : 0);
for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
- const int row4x4 = row4x4_start + sb_y;
- const int y = (MultiplyBy4(row4x4) - (row4x4 == 0 ? 0 : 8)) >>
- subsampling_y_[plane];
if (y >= plane_height) break;
- const int plane_unit_size =
- frame_header_.loop_restoration.unit_size[plane];
const int unit_row = std::min((y + unit_height_offset) / plane_unit_size,
num_vertical_units - 1);
- const int expected_height = plane_process_unit_height[plane] +
- ((y == 0) ? -unit_height_offset : 0);
const int current_process_unit_height =
- (y + expected_height <= plane_height) ? expected_height
- : plane_height - y;
- for (int column4x4 = 0;; column4x4 += 16) {
- const int x = MultiplyBy4(column4x4) >> subsampling_x_[plane];
- if (x >= plane_width) break;
+ std::min(expected_height, plane_height - y);
+ for (int x = 0; x < plane_width; x += plane_unit_size) {
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
ApplyLoopRestorationForSuperBlock<uint16_t>(
static_cast<Plane>(plane), x, y, unit_row,
- current_process_unit_height, process_unit_width);
+ current_process_unit_height, plane_unit_size);
continue;
}
#endif
ApplyLoopRestorationForSuperBlock<uint8_t>(
static_cast<Plane>(plane), x, y, unit_row,
- current_process_unit_height, process_unit_width);
+ current_process_unit_height, plane_unit_size);
}
+ expected_height = plane_process_unit_height;
+ y += current_process_unit_height;
}
}
}
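For illustration only (not part of the patch): the reworked loop above now carries |y| and |expected_height| across iterations instead of recomputing them from |row4x4|. A minimal sketch of the stepping, assuming hypothetical values kRestorationUnitHeight == 64, unit_height_offset == 8 and a luma plane_height of 200:

#include <algorithm>  // std::min

void SketchRowStepping() {
  int y = 0;                     // row4x4_start == 0 in this sketch
  int expected_height = 64 - 8;  // first unit is shortened by the offset
  while (y < 200) {              // 200 == hypothetical plane_height
    const int current_process_unit_height = std::min(expected_height, 200 - y);
    // Filter rows [y, y + current_process_unit_height) here.
    expected_height = 64;        // every later unit uses the full height
    y += current_process_unit_height;
  }
  // Visits the row ranges [0, 56), [56, 120), [120, 184) and [184, 200).
}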
@@ -188,18 +169,16 @@ void PostFilter::ApplyLoopRestorationForOneRowInWindow(
uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride,
const Plane plane, const int plane_height, const int plane_width,
const int x, const int y, const int row, const int unit_row,
- const int current_process_unit_height, const int process_unit_width,
- const int window_width, const int plane_unit_size,
- const int num_horizontal_units) {
+ const int current_process_unit_height, const int plane_unit_size,
+ const int window_width, const int num_horizontal_units) {
Array2DView<Pixel> loop_restored_window(
window_buffer_height_, window_buffer_width_,
reinterpret_cast<Pixel*>(threaded_window_buffer_));
- for (int column = 0; column < window_width; column += process_unit_width) {
+ for (int column = 0; column < window_width; column += plane_unit_size) {
ApplyLoopRestorationForOneUnit<Pixel>(
cdef_buffer, cdef_buffer_stride, plane, plane_height, x, y, row, column,
- unit_row, current_process_unit_height, process_unit_width,
- plane_unit_size, num_horizontal_units, plane_width,
- &loop_restored_window);
+ unit_row, current_process_unit_height, plane_unit_size,
+ num_horizontal_units, plane_width, &loop_restored_window);
}
}
@@ -210,20 +189,14 @@ void PostFilter::ApplyLoopRestorationForOneRowInWindow(
// completes filtering until all jobs are finished. This approach requires an
// extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose
// size is the size of the window. It also needs block buffers (i.e.,
-// |block_buffer| and |intermediate_buffers| in
-// ApplyLoopRestorationForOneUnit()) to store intermediate results in loop
-// restoration for each thread. After all units inside the window are filtered,
-// the output is written to the frame buffer.
+// |block_buffer| in ApplyLoopRestorationForOneUnit()) to store intermediate
+// results in loop restoration for each thread. After all units inside the
+// window are filtered, the output is written to the frame buffer.
template <typename Pixel>
void PostFilter::ApplyLoopRestorationThreaded() {
- const int plane_process_unit_width[kMaxPlanes] = {
- kRestorationProcessingUnitSize,
- kRestorationProcessingUnitSize >> subsampling_x_[kPlaneU],
- kRestorationProcessingUnitSize >> subsampling_x_[kPlaneV]};
const int plane_process_unit_height[kMaxPlanes] = {
- kRestorationProcessingUnitSize,
- kRestorationProcessingUnitSize >> subsampling_y_[kPlaneU],
- kRestorationProcessingUnitSize >> subsampling_y_[kPlaneV]};
+ kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU],
+ kRestorationUnitHeight >> subsampling_y_[kPlaneV]};
for (int plane = kPlaneY; plane < planes_; ++plane) {
if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
@@ -270,11 +243,11 @@ void PostFilter::ApplyLoopRestorationThreaded() {
plane_process_unit_height[plane] +
1;
}
+ const int jobs_for_threadpool =
+ vertical_units_per_window * num_workers / (num_workers + 1);
for (int x = 0; x < plane_width; x += window_buffer_width_) {
const int actual_window_width =
std::min(window_buffer_width_, plane_width - x);
- const int jobs_for_threadpool =
- vertical_units_per_window * num_workers / (num_workers + 1);
assert(jobs_for_threadpool < vertical_units_per_window);
BlockingCounter pending_jobs(jobs_for_threadpool);
int job_count = 0;
@@ -282,37 +255,32 @@ void PostFilter::ApplyLoopRestorationThreaded() {
for (int row = 0; row < actual_window_height;
row += current_process_unit_height) {
const int unit_y = y + row;
- const int expected_height = plane_process_unit_height[plane] +
- ((unit_y == 0) ? -unit_height_offset : 0);
+ const int expected_height = plane_process_unit_height[plane] -
+ ((unit_y == 0) ? unit_height_offset : 0);
current_process_unit_height =
- (unit_y + expected_height <= plane_height)
- ? expected_height
- : plane_height - unit_y;
+ std::min(expected_height, plane_height - unit_y);
const int unit_row =
std::min((unit_y + unit_height_offset) / plane_unit_size,
num_vertical_units - 1);
- const int process_unit_width = plane_process_unit_width[plane];
if (job_count < jobs_for_threadpool) {
thread_pool_->Schedule(
- [this, src_buffer, src_stride, process_unit_width,
+ [this, src_buffer, src_stride, plane_unit_size,
current_process_unit_height, actual_window_width,
- plane_unit_size, num_horizontal_units, x, y, row, unit_row,
- plane_height, plane_width, plane, &pending_jobs]() {
+ num_horizontal_units, x, y, row, unit_row, plane_height,
+ plane_width, plane, &pending_jobs]() {
ApplyLoopRestorationForOneRowInWindow<Pixel>(
src_buffer, src_stride, static_cast<Plane>(plane),
plane_height, plane_width, x, y, row, unit_row,
- current_process_unit_height, process_unit_width,
- actual_window_width, plane_unit_size,
- num_horizontal_units);
+ current_process_unit_height, plane_unit_size,
+ actual_window_width, num_horizontal_units);
pending_jobs.Decrement();
});
} else {
ApplyLoopRestorationForOneRowInWindow<Pixel>(
src_buffer, src_stride, static_cast<Plane>(plane), plane_height,
plane_width, x, y, row, unit_row, current_process_unit_height,
- process_unit_width, actual_window_width, plane_unit_size,
- num_horizontal_units);
+ plane_unit_size, actual_window_width, num_horizontal_units);
}
++job_count;
}
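A brief aside on the |jobs_for_threadpool| computation hoisted above the column loop: here is a sketch-level worked example (not part of the patch), assuming hypothetical values num_workers == 3 and vertical_units_per_window == 8.

void SketchJobSplit() {
  const int num_workers = 3;                // hypothetical
  const int vertical_units_per_window = 8;  // hypothetical
  // 8 * 3 / (3 + 1) == 6 rows go to the thread pool; the calling thread
  // filters the remaining 2 rows itself, so it always keeps at least one row,
  // which is what the assert below the computation checks.
  const int jobs_for_threadpool =
      vertical_units_per_window * num_workers / (num_workers + 1);
  static_cast<void>(jobs_for_threadpool);
}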
diff --git a/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc b/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc
index 1b65e9fbcf8..6174aabdee6 100644
--- a/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc
+++ b/chromium/third_party/libgav1/src/src/post_filter/post_filter.cc
@@ -31,6 +31,9 @@
namespace libgav1 {
namespace {
+// Import all the constants in the anonymous namespace.
+#include "src/post_filter/deblock_thresholds.inc"
+
// Row indices of deblocked pixels needed by loop restoration. This is used to
// populate the |deblock_buffer_| when cdef is on. The first dimension is
// subsampling_y.
@@ -122,16 +125,11 @@ void ExtendFrame(uint8_t* const frame_start, const int width, const int height,
} // namespace
-PostFilter::PostFilter(
- const ObuFrameHeader& frame_header,
- const ObuSequenceHeader& sequence_header, LoopFilterMask* const masks,
- const Array2D<int16_t>& cdef_index,
- const Array2D<TransformSize>& inter_transform_sizes,
- LoopRestorationInfo* const restoration_info,
- BlockParametersHolder* block_parameters, YuvBuffer* const frame_buffer,
- YuvBuffer* const deblock_buffer, const dsp::Dsp* dsp,
- ThreadPool* const thread_pool, uint8_t* const threaded_window_buffer,
- uint8_t* const superres_line_buffer, int do_post_filter_mask)
+PostFilter::PostFilter(const ObuFrameHeader& frame_header,
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ YuvBuffer* const frame_buffer, const dsp::Dsp* dsp,
+ int do_post_filter_mask)
: frame_header_(frame_header),
loop_restoration_(frame_header.loop_restoration),
dsp_(*dsp),
@@ -149,24 +147,24 @@ PostFilter::PostFilter(
: kMaxPlanes),
pixel_size_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
: sizeof(uint16_t))),
- masks_(masks),
- cdef_index_(cdef_index),
- inter_transform_sizes_(inter_transform_sizes),
- threaded_window_buffer_(threaded_window_buffer),
- restoration_info_(restoration_info),
- window_buffer_width_(GetWindowBufferWidth(thread_pool, frame_header)),
- window_buffer_height_(GetWindowBufferHeight(thread_pool, frame_header)),
- superres_line_buffer_(superres_line_buffer),
- block_parameters_(*block_parameters),
+ inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
+ outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ threaded_window_buffer_(
+ frame_scratch_buffer->threaded_window_buffer.get()),
+ restoration_info_(&frame_scratch_buffer->loop_restoration_info),
+ superres_line_buffer_(frame_scratch_buffer->superres_line_buffer.get()),
+ block_parameters_(frame_scratch_buffer->block_parameters_holder),
frame_buffer_(*frame_buffer),
- deblock_buffer_(*deblock_buffer),
+ deblock_buffer_(frame_scratch_buffer->deblock_buffer),
do_post_filter_mask_(do_post_filter_mask),
- thread_pool_(thread_pool) {
+ thread_pool_(
+ frame_scratch_buffer->threading_strategy.post_filter_thread_pool()),
+ window_buffer_width_(GetWindowBufferWidth(thread_pool_, frame_header)),
+ window_buffer_height_(GetWindowBufferHeight(thread_pool_, frame_header)) {
const int8_t zero_delta_lf[kFrameLfCount] = {};
ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
- if (DoDeblock()) {
- InitDeblockFilterParams();
- }
if (DoSuperRes()) {
for (int plane = 0; plane < planes_; ++plane) {
const int downscaled_width =
@@ -196,7 +194,7 @@ PostFilter::PostFilter(
// In single threaded mode, we apply SuperRes without making a copy of the
// input row by writing the output to one row to the top (we refer to this
// process as "in place superres" in our code).
- const bool in_place_superres = DoSuperRes() && thread_pool == nullptr;
+ const bool in_place_superres = DoSuperRes() && thread_pool_ == nullptr;
if (DoCdef() || DoRestoration() || in_place_superres) {
for (int plane = 0; plane < planes_; ++plane) {
int horizontal_shift = 0;
@@ -372,8 +370,8 @@ void PostFilter::ApplyFilteringThreaded() {
if (DoDeblock()) ApplyDeblockFilterThreaded();
if (DoCdef() && DoRestoration()) {
for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
- row4x4 += kNum4x4InLoopFilterMaskUnit) {
- SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterMaskUnit);
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterUnit);
}
}
if (DoCdef()) ApplyCdef();
@@ -383,9 +381,10 @@ void PostFilter::ApplyFilteringThreaded() {
}
int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
- bool is_last_row) {
+ bool is_last_row,
+ bool do_deblock) {
if (row4x4 < 0) return -1;
- if (DoDeblock()) {
+ if (DoDeblock() && do_deblock) {
ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4);
}
if (DoRestoration() && DoCdef()) {
diff --git a/chromium/third_party/libgav1/src/src/post_filter/super_res.cc b/chromium/third_party/libgav1/src/src/post_filter/super_res.cc
index 2dc1dcd61cf..8f17a37b5cb 100644
--- a/chromium/third_party/libgav1/src/src/post_filter/super_res.cc
+++ b/chromium/third_party/libgav1/src/src/post_filter/super_res.cc
@@ -35,10 +35,10 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& buffers,
const std::array<int, kMaxPlanes>& strides,
const std::array<int, kMaxPlanes>& rows,
size_t line_buffer_offset) {
- uint8_t* const line_buffer_start =
- in_place ? nullptr
- : superres_line_buffer_ + line_buffer_offset +
- kSuperResHorizontalBorder * pixel_size_;
+ // Only used when |in_place| == false.
+ uint8_t* const line_buffer_start = superres_line_buffer_ +
+ line_buffer_offset +
+ kSuperResHorizontalBorder * pixel_size_;
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int8_t subsampling_x = subsampling_x_[plane];
const int plane_width =
diff --git a/chromium/third_party/libgav1/src/src/threading_strategy.cc b/chromium/third_party/libgav1/src/src/threading_strategy.cc
index 75e2ed60270..5c0b940c835 100644
--- a/chromium/third_party/libgav1/src/src/threading_strategy.cc
+++ b/chromium/third_party/libgav1/src/src/threading_strategy.cc
@@ -16,15 +16,52 @@
#include <algorithm>
#include <cassert>
+#include <memory>
+#include "src/frame_scratch_buffer.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
+#include "src/utils/vector.h"
namespace libgav1 {
+namespace {
+
+// Computes the number of frame threads to be used based on the following
+// heuristic:
+// * If |thread_count| == 1, return 0.
+// * If |thread_count| <= |tile_count| * 4, return 0.
+// * Otherwise, return the largest value of i which satisfies the following
+// condition: i + i * tile_columns <= thread_count. This ensures that there
+// are at least |tile_columns| worker threads for each frame thread.
+// * This function will never return 1 or a value > |thread_count|.
+//
+// This heuristic is based on empirical performance data. The in-frame threading
+// model (combination of tile multithreading, superblock row multithreading and
+// post filter multithreading) performs better than the frame parallel model
+// until we reach the threshold of |thread_count| > |tile_count| * 4.
+//
+// It is a function of |tile_count| since tile threading and superblock row
+// multithreading will scale only as a factor of |tile_count|. The threshold 4
+// is arrived at based on empirical data. The general idea is that superblock
+// row multithreading plateaus at 4 * |tile_count| because in most practical
+// cases there aren't more than that many superblock rows and columns available
+// to work on in parallel.
+int ComputeFrameThreadCount(int thread_count, int tile_count,
+ int tile_columns) {
+ assert(thread_count > 0);
+ if (thread_count == 1) return 0;
+ return (thread_count <= tile_count * 4)
+ ? 0
+ : std::max(2, thread_count / (1 + tile_columns));
+}
+
+} // namespace
bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
int thread_count) {
assert(thread_count > 0);
+ frame_parallel_ = false;
+
if (thread_count == 1) {
thread_pool_.reset(nullptr);
tile_thread_count_ = 0;
@@ -103,14 +140,74 @@ bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
return true;
}
+bool ThreadingStrategy::Reset(int thread_count) {
+ assert(thread_count > 0);
+ frame_parallel_ = true;
+
+ // In frame parallel mode, we simply access the underlying |thread_pool_|
+ // directly. So ensure all the other threadpool getter functions return
+ // nullptr. Also, superblock row multithreading is always disabled in frame
+ // parallel mode.
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+
+ if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+ thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
+ if (thread_pool_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+ thread_count);
+ return false;
+ }
+ }
+ return true;
+}
+
bool InitializeThreadPoolsForFrameParallel(
- int thread_count, std::unique_ptr<ThreadPool>* const frame_thread_pool) {
- *frame_thread_pool = ThreadPool::Create(thread_count);
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* const frame_thread_pool,
+ FrameScratchBufferPool* const frame_scratch_buffer_pool) {
+ assert(*frame_thread_pool == nullptr);
+ thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
+ const int frame_threads =
+ ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
+ if (frame_threads == 0) return true;
+ *frame_thread_pool = ThreadPool::Create(frame_threads);
if (*frame_thread_pool == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
- thread_count);
+ frame_threads);
return false;
}
+ int remaining_threads = thread_count - frame_threads;
+ if (remaining_threads == 0) return true;
+ int threads_per_frame = remaining_threads / frame_threads;
+ const int extra_threads = remaining_threads % frame_threads;
+ Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+ if (!frame_scratch_buffers.reserve(frame_threads)) return false;
+ // Create the tile thread pools.
+ for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool->Get();
+ if (frame_scratch_buffer == nullptr) {
+ return false;
+ }
+ // If the number of tile threads cannot be divided equally amongst all the
+ // frame threads, assign one extra thread to the first |extra_threads| frame
+ // threads.
+ const int current_frame_thread_count =
+ threads_per_frame + static_cast<int>(i < extra_threads);
+ if (!frame_scratch_buffer->threading_strategy.Reset(
+ current_frame_thread_count)) {
+ return false;
+ }
+ remaining_threads -= current_frame_thread_count;
+ frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
+ }
+ // We release the frame scratch buffers in reverse order so that the extra
+// threads are allocated to buffers at the top of the stack.
+ for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
+ --i) {
+ frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
+ }
return true;
}
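As a worked example of the new thread budgeting (a sketch with hypothetical numbers, not part of the patch): with thread_count == 16, tile_count == 2 and tile_columns == 2, ComputeFrameThreadCount() and the distribution loop above split the threads as follows.

#include <algorithm>  // std::max

void SketchThreadBudget() {
  const int thread_count = 16, tile_count = 2, tile_columns = 2;  // hypothetical
  // 16 > 2 * 4, so frame threading is used.
  const int frame_threads =
      (thread_count <= tile_count * 4)
          ? 0
          : std::max(2, thread_count / (1 + tile_columns));         // == 5
  const int remaining_threads = thread_count - frame_threads;       // == 11
  const int threads_per_frame = remaining_threads / frame_threads;  // == 2
  const int extra_threads = remaining_threads % frame_threads;      // == 1
  // Frame thread 0 gets a 3-thread worker pool, frame threads 1..4 get 2 each
  // (3 + 2 + 2 + 2 + 2 == 11), so all 16 threads are accounted for.
  static_cast<void>(threads_per_frame + extra_threads);
}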
diff --git a/chromium/third_party/libgav1/src/src/threading_strategy.h b/chromium/third_party/libgav1/src/src/threading_strategy.h
index 5822bb31f36..84b35896d26 100644
--- a/chromium/third_party/libgav1/src/src/threading_strategy.h
+++ b/chromium/third_party/libgav1/src/src/threading_strategy.h
@@ -25,6 +25,8 @@
namespace libgav1 {
+class FrameScratchBufferPool;
+
// This class allocates and manages the worker threads among thread pools used
// for multi-threaded decoding.
class ThreadingStrategy {
@@ -36,18 +38,28 @@ class ThreadingStrategy {
ThreadingStrategy& operator=(const ThreadingStrategy&) = delete;
// Creates or re-allocates the thread pools based on the |frame_header| and
- // |thread_count|. This function is idempotent if the |frame_header| and
- // |thread_count| doesn't change between calls (it will only create new
- // threads on the first call and do nothing on the subsequent calls). This
- // function also starts the worker threads whenever it creates new thread
- // pools.
+ // |thread_count|. This function is used only in non frame-parallel mode. This
+ // function is idempotent if the |frame_header| and |thread_count| don't
+ // change between calls (it will only create new threads on the first call and
+ // do nothing on the subsequent calls). This function also starts the worker
+ // threads whenever it creates new thread pools.
// The following strategy is used to allocate threads:
// * One thread is allocated for decoding each Tile.
// * Any remaining threads are allocated for superblock row multi-threading
// within each of the tile in a round robin fashion.
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header,
int thread_count);
+ // Creates or re-allocates a thread pool with |thread_count| threads. This
+ // function is used only in frame parallel mode. This function is idempotent
+ // if the |thread_count| doesn't change between calls (it will only create new
+ // threads on the first call and do nothing on the subsequent calls).
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count);
+
// Returns a pointer to the ThreadPool that is to be used for Tile
// multi-threading.
ThreadPool* tile_thread_pool() const {
@@ -56,8 +68,14 @@ class ThreadingStrategy {
int tile_thread_count() const { return tile_thread_count_; }
+ // Returns a pointer to the underlying ThreadPool.
+ // Note: Valid only when |frame_parallel_| is true. This is used for
+ // facilitating in-frame multi-threading in that case.
+ ThreadPool* thread_pool() const { return thread_pool_.get(); }
+
// Returns a pointer to the ThreadPool that is to be used within the Tile at
// index |tile_index| for superblock row multi-threading.
+ // Note: Valid only when |frame_parallel_| is false.
ThreadPool* row_thread_pool(int tile_index) const {
return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get()
: nullptr;
@@ -65,20 +83,48 @@ class ThreadingStrategy {
// Returns a pointer to the ThreadPool that is to be used for post filter
// multi-threading.
- ThreadPool* post_filter_thread_pool() const { return thread_pool_.get(); }
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* post_filter_thread_pool() const {
+ return frame_parallel_ ? nullptr : thread_pool_.get();
+ }
// Returns a pointer to the ThreadPool that is to be used for film grain
// synthesis and blending.
+ // Note: Valid only when |frame_parallel_| is false.
ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); }
private:
std::unique_ptr<ThreadPool> thread_pool_;
- int tile_thread_count_;
- int max_tile_index_for_row_threads_;
+ int tile_thread_count_ = 0;
+ int max_tile_index_for_row_threads_ = 0;
+ bool frame_parallel_ = false;
};
+// Initializes the |frame_thread_pool| and the necessary worker threadpools (the
+// threading_strategy objects in each of the frame scratch buffers in
+// |frame_scratch_buffer_pool|) as follows:
+// * frame_threads = ComputeFrameThreadCount();
+// * For more details on how frame_threads is computed, see the function
+// comment in ComputeFrameThreadCount().
+// * |frame_thread_pool| is created with |frame_threads| threads.
+// * divide the remaining threads among the frame threads and
+// initialize a frame_scratch_buffer.threading_strategy for each frame
+// thread.
+// When this function is called, |frame_scratch_buffer_pool| must be empty. If
+// this function returns true, it means the initialization was successful and
+// one of the following is true:
+// * |frame_thread_pool| has been successfully initialized and
+// |frame_scratch_buffer_pool| has been successfully populated with
+// |frame_threads| buffers to be used by each frame thread. The total
+// number of threads that this function creates will always be equal to
+// |thread_count|.
+// * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not
+// modified. This means that frame threading will not be used and the
+// decoder will continue to operate normally in non frame parallel mode.
LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel(
- int thread_count, std::unique_ptr<ThreadPool>* frame_thread_pool);
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* frame_thread_pool,
+ FrameScratchBufferPool* frame_scratch_buffer_pool);
} // namespace libgav1
diff --git a/chromium/third_party/libgav1/src/src/tile.h b/chromium/third_party/libgav1/src/src/tile.h
index d8f48b4df27..7fb7e2296c0 100644
--- a/chromium/third_party/libgav1/src/src/tile.h
+++ b/chromium/third_party/libgav1/src/src/tile.h
@@ -33,7 +33,6 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/frame_scratch_buffer.h"
-#include "src/loop_filter_mask.h"
#include "src/loop_restoration_info.h"
#include "src/obu_parser.h"
#include "src/post_filter.h"
@@ -77,16 +76,14 @@ class Tile : public Allocable {
const WedgeMaskArray& wedge_masks,
SymbolDecoderContext* const saved_symbol_decoder_context,
const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
- BlockParametersHolder* const block_parameters_holder,
const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
bool use_intra_prediction_buffer) {
std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
tile_number, data, size, sequence_header, frame_header, current_frame,
state, frame_scratch_buffer, wedge_masks, saved_symbol_decoder_context,
- prev_segment_ids, post_filter, block_parameters_holder, dsp,
- thread_pool, pending_tiles, frame_parallel,
- use_intra_prediction_buffer));
+ prev_segment_ids, post_filter, dsp, thread_pool, pending_tiles,
+ frame_parallel, use_intra_prediction_buffer));
return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
}
@@ -100,9 +97,17 @@ class Tile : public Allocable {
// Parses the entire tile.
bool Parse();
+ // Decodes the entire tile. |superblock_row_progress| and
+ // |superblock_row_progress_condvar| are arrays of size equal to the number of
+ // superblock rows in the frame. Increments |superblock_row_progress[i]| after
+ // each superblock row at index |i| is decoded. If the count reaches the
+ // number of tile columns, then it notifies
+ // |superblock_row_progress_condvar[i]|.
+ bool Decode(std::mutex* mutex, int* superblock_row_progress,
+ std::condition_variable* superblock_row_progress_condvar);
// Parses and decodes the entire tile. Depending on the configuration of this
// Tile, this function may do multithreaded decoding.
- bool ParseAndDecode(bool is_main_thread); // 5.11.2.
+ bool ParseAndDecode(); // 5.11.2.
// Processes all the columns of the superblock row at |row4x4| that are within
// this Tile. If |save_symbol_decoder_context| is true, then
// SaveSymbolDecoderContext() is invoked for the last superblock row.
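The Decode() declaration above describes a per-superblock-row progress counter paired with a condition variable. Below is a minimal self-contained sketch of that signalling pattern (hypothetical names, not the library's code); in the patch the counter and condvar arrays live in the caller and are shared across all tile columns.

#include <condition_variable>
#include <mutex>

struct RowProgress {
  std::mutex mutex;
  int done_columns = 0;             // role of superblock_row_progress[i]
  std::condition_variable condvar;  // role of superblock_row_progress_condvar[i]
};

// Called by a tile after decoding its part of superblock row |i|.
void MarkColumnDone(RowProgress& row, int tile_columns) {
  bool notify;
  {
    std::lock_guard<std::mutex> lock(row.mutex);
    notify = ++row.done_columns == tile_columns;
  }
  if (notify) row.condvar.notify_one();
}

// Called by the post filter worker before filtering superblock row |i|.
void WaitForRow(RowProgress& row, int tile_columns) {
  std::unique_lock<std::mutex> lock(row.mutex);
  row.condvar.wait(lock, [&row, tile_columns] {
    return row.done_columns == tile_columns;
  });
}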
@@ -118,10 +123,14 @@ class Tile : public Allocable {
return reference_frame_sign_bias_;
}
+ bool IsRow4x4Inside(int row4x4) const {
+ return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_;
+ }
+
// 5.11.51.
bool IsInside(int row4x4, int column4x4) const {
- return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_ &&
- column4x4 >= column4x4_start_ && column4x4 < column4x4_end_;
+ return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ &&
+ column4x4 < column4x4_end_;
}
bool IsLeftInside(int column4x4) const {
@@ -168,9 +177,13 @@ class Tile : public Allocable {
const BlockParameters& Parameters(int row, int column) const {
return *block_parameters_holder_.Find(row, column);
}
+
int number() const { return number_; }
int superblock_rows() const { return superblock_rows_; }
int superblock_columns() const { return superblock_columns_; }
+ int row4x4_start() const { return row4x4_start_; }
+ int column4x4_start() const { return column4x4_start_; }
+ int column4x4_end() const { return column4x4_end_; }
private:
Tile(int tile_number, const uint8_t* data, size_t size,
@@ -180,9 +193,9 @@ class Tile : public Allocable {
const WedgeMaskArray& wedge_masks,
SymbolDecoderContext* saved_symbol_decoder_context,
const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
- BlockParametersHolder* block_parameters_holder, const dsp::Dsp* dsp,
- ThreadPool* thread_pool, BlockingCounterWithStatus* pending_tiles,
- bool frame_parallel, bool use_intra_prediction_buffer);
+ const dsp::Dsp* dsp, ThreadPool* thread_pool,
+ BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer);
// Stores the transform tree state when reading variable size transform trees
// and when applying the transform tree. When applying the transform tree,
@@ -201,16 +214,20 @@ class Tile : public Allocable {
int depth;
};
+ // Enum to track the processing state of a superblock.
+ enum SuperBlockState : uint8_t {
+ kSuperBlockStateNone, // Not yet parsed or decoded.
+ kSuperBlockStateParsed, // Parsed but not yet decoded.
+ kSuperBlockStateScheduled, // Scheduled for decoding.
+ kSuperBlockStateDecoded // Parsed and decoded.
+ };
+
// Parameters used to facilitate multi-threading within the Tile.
struct ThreadingParameters {
std::mutex mutex;
- // Array2DView of size |superblock_rows_| by |superblock_columns_|
- // containing the processing state of each superblock. The code in this
- // class uses relative indexing of superblocks with respect to this Tile.
- // The memory for this comes from the caller (the |super_block_state|
- // parameter in the constructor). The memory is for the whole frame whereas
- // the |sb_state| array in this struct points to the beginning of this Tile.
- Array2DView<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex);
+ // 2d array of size |superblock_rows_| by |superblock_columns_| containing
+ // the processing state of each superblock.
+ Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex);
// Variable used to indicate either parse or decode failure.
bool abort LIBGAV1_GUARDED_BY(mutex) = false;
int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0;
@@ -297,14 +314,6 @@ class Tile : public Allocable {
void ResetLoopRestorationParams();
void ReadLoopRestorationCoefficients(int row4x4, int column4x4,
BlockSize block_size); // 5.11.57.
- // Build bit masks for vertical edges followed by horizontal edges.
- // Traverse through each transform edge in the current coding block, and
- // determine if a 4x4 edge needs filtering. If filtering is needed, determine
- // filter length. Set corresponding bit mask to 1.
- void BuildBitMask(const Block& block);
- void BuildBitMaskHelper(const Block& block, int row4x4, int column4x4,
- BlockSize block_size, bool is_vertical_block_border,
- bool is_horizontal_block_border);
// Helper functions for DecodeBlock.
bool ReadSegmentId(const Block& block); // 5.11.9.
@@ -582,8 +591,8 @@ class Tile : public Allocable {
}
const int number_;
- int row_;
- int column_;
+ const int row_;
+ const int column_;
const uint8_t* const data_;
size_t size_;
int row4x4_start_;
@@ -729,14 +738,17 @@ class Tile : public Allocable {
int8_t delta_lf_[kFrameLfCount];
// True if all the values in |delta_lf_| are zero. False otherwise.
bool delta_lf_all_zero_;
- bool build_bit_mask_when_parsing_;
const bool frame_parallel_;
const bool use_intra_prediction_buffer_;
// Buffer used to store the unfiltered pixels that are necessary for decoding
// the next superblock row (for the intra prediction process). Used only if
- // |use_intra_prediction_buffer_| is true.
- std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>
- intra_prediction_buffer_;
+ // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains
+ // one row buffer for each tile row. This tile uses the buffer corresponding
+ // to its tile row.
+ IntraPredictionBuffer* const intra_prediction_buffer_;
+ // Stores the progress of the reference frames. This will be used to avoid
+ // unnecessary calls into RefCountedBuffer::WaitUntil().
+ std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_;
};
struct Tile::Block {
diff --git a/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc b/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc
index c13fbe3b907..1bae5a3c1b6 100644
--- a/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc
+++ b/chromium/third_party/libgav1/src/src/tile/bitstream/mode_info.cc
@@ -1100,12 +1100,11 @@ uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) {
uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) {
const BlockParameters& bp = *block.bp;
- const int forward = std::abs(GetRelativeDistance(
- current_frame_.order_hint(bp.reference_frame[0]),
- frame_header_.order_hint, sequence_header_.order_hint_shift_bits));
- const int backward = std::abs(GetRelativeDistance(
- current_frame_.order_hint(bp.reference_frame[1]),
- frame_header_.order_hint, sequence_header_.order_hint_shift_bits));
+ const ReferenceInfo& reference_info = *current_frame_.reference_info();
+ const int forward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]);
+ const int backward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]);
int context = (forward == backward) ? 3 : 0;
if (block.top_available[kPlaneY]) {
if (!block.IsTopSingle()) {
diff --git a/chromium/third_party/libgav1/src/src/tile/prediction.cc b/chromium/third_party/libgav1/src/src/tile/prediction.cc
index 672b5a2b3a7..785c1dac404 100644
--- a/chromium/third_party/libgav1/src/src/tile/prediction.cc
+++ b/chromium/third_party/libgav1/src/src/tile/prediction.cc
@@ -277,7 +277,6 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
(mode == kPredictionModeDc && has_left);
const Pixel* top_row_src = buffer[y - 1];
- int top_row_offset = 0;
// Determine if we need to retrieve the top row from
// |intra_prediction_buffer_|.
@@ -295,13 +294,8 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
// then we will have to retrieve the top row from the
// |intra_prediction_buffer_|.
if (current_superblock_index != top_row_superblock_index) {
- top_row_src =
- reinterpret_cast<const Pixel*>(intra_prediction_buffer_[plane].get());
- // The |intra_prediction_buffer_| only stores the top row for this Tile.
- // The |x| value in this function is absolute to the frame. So in order to
- // make it relative to this Tile, all acccesses into top_row_src must be
- // offset by negative |top_row_offset|.
- top_row_offset = MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+ top_row_src = reinterpret_cast<const Pixel*>(
+ (*intra_prediction_buffer_)[plane].get());
}
}
@@ -309,8 +303,7 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
// Compute top_row.
if (has_top || has_left) {
const int left_index = has_left ? x - 1 : x;
- top_row[-1] = has_top ? top_row_src[left_index - top_row_offset]
- : buffer[y][left_index];
+ top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index];
} else {
top_row[-1] = 1 << (bitdepth - 1);
}
@@ -320,14 +313,12 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size);
} else {
const int top_limit = std::min(max_x - x + 1, top_right_size);
- memcpy(top_row, &top_row_src[x - top_row_offset],
- top_limit * sizeof(Pixel));
+ memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel));
// Even though it is safe to call Memset with a size of 0, accessing
// top_row_src[top_limit - x + 1] is not allowed when this condition is
// false.
if (top_size - top_limit > 0) {
- Memset(top_row + top_limit,
- top_row_src[top_limit + x - 1 - top_row_offset],
+ Memset(top_row + top_limit, top_row_src[top_limit + x - 1],
top_size - top_limit);
}
}
@@ -336,13 +327,13 @@ void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
// Compute left_column.
if (has_top || has_left) {
const int left_index = has_left ? x - 1 : x;
- left_column[-1] = has_top ? top_row_src[left_index - top_row_offset]
- : buffer[y][left_index];
+ left_column[-1] =
+ has_top ? top_row_src[left_index] : buffer[y][left_index];
} else {
left_column[-1] = 1 << (bitdepth - 1);
}
if (!has_left && has_top) {
- Memset(left_column, top_row_src[x - top_row_offset], left_size);
+ Memset(left_column, top_row_src[x], left_size);
} else if (!has_left && !has_top) {
Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size);
} else {
@@ -942,14 +933,13 @@ void Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
for (int reference = 0; reference < 2; ++reference) {
const BlockParameters& bp =
*block_parameters_holder_.Find(candidate_row, candidate_column);
- const unsigned int reference_hint =
- current_frame_.order_hint(bp.reference_frame[reference]);
// Note: distance[0] and distance[1] correspond to relative distance
// between current frame and reference frame [1] and [0], respectively.
- distance[1 - reference] = Clip3(
- std::abs(GetRelativeDistance(reference_hint, frame_header_.order_hint,
- sequence_header_.order_hint_shift_bits)),
- 0, kMaxFrameDistance);
+ distance[1 - reference] = std::min(
+ std::abs(static_cast<int>(
+ current_frame_.reference_info()
+ ->relative_distance_from[bp.reference_frame[reference]])),
+ static_cast<int>(kMaxFrameDistance));
}
GetDistanceWeights(distance, weight);
@@ -1136,7 +1126,11 @@ bool Tile::BlockInterPrediction(
// reference_y_max by 2 since we only track the progress of Y planes.
reference_y_max = LeftShift(reference_y_max, subsampling_y);
}
- if (!reference_frames_[reference_frame_index]->WaitUntil(reference_y_max)) {
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
return false;
}
}
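The guard added above consults |reference_frame_progress_cache_| so that WaitUntil() is only entered when the cached progress is insufficient; on success, WaitUntil() refreshes the cached value through its second argument (the two-argument signature is the one used in the patch). A hedged sketch of the same pattern, with |progress_cache|, |reference| and |needed_row| being hypothetical names:

// Sketch only; reuses RefCountedBuffer::WaitUntil(int, int*) from the patch.
bool WaitForReferenceRow(int* progress_cache, RefCountedBuffer* reference,
                         int needed_row) {
  // Fast path: the cache already shows enough rows are decoded, so the call
  // into WaitUntil() (and its synchronization) is skipped entirely.
  if (*progress_cache >= needed_row) return true;
  // Slow path: block until the reference frame has decoded |needed_row| rows
  // (or decoding failed), and remember the progress that was reached.
  return reference->WaitUntil(needed_row, progress_cache);
}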
@@ -1275,7 +1269,11 @@ bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
// For U and V planes with subsampling, we need to multiply reference_y_max
// by 2 since we only track the progress of Y planes.
reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]);
- if (!reference_frames_[reference_frame_index]->WaitUntil(reference_y_max)) {
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
return false;
}
}
diff --git a/chromium/third_party/libgav1/src/src/tile/tile.cc b/chromium/third_party/libgav1/src/src/tile/tile.cc
index 50daf1add34..ed00e282018 100644
--- a/chromium/third_party/libgav1/src/src/tile/tile.cc
+++ b/chromium/third_party/libgav1/src/src/tile/tile.cc
@@ -17,6 +17,7 @@
#include <algorithm>
#include <array>
#include <cassert>
+#include <climits>
#include <cstdlib>
#include <cstring>
#include <memory>
@@ -100,6 +101,14 @@ constexpr PredictionMode
kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
kPredictionModeD157, kPredictionModeDc};
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+ kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+ kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+ kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+ kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+ kPredictionModeNewNewMv);
+
// This is computed as:
// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
@@ -383,12 +392,13 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
const WedgeMaskArray& wedge_masks,
SymbolDecoderContext* const saved_symbol_decoder_context,
const SegmentationMap* prev_segment_ids,
- PostFilter* const post_filter,
- BlockParametersHolder* const block_parameters_holder,
- const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
+ PostFilter* const post_filter, const dsp::Dsp* const dsp,
+ ThreadPool* const thread_pool,
BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
bool use_intra_prediction_buffer)
: number_(tile_number),
+ row_(number_ / frame_header.tile_info.tile_columns),
+ column_(number_ % frame_header.tile_info.tile_columns),
data_(data),
size_(size),
read_deltas_(false),
@@ -410,7 +420,7 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
prev_segment_ids_(prev_segment_ids),
dsp_(*dsp),
post_filter_(*post_filter),
- block_parameters_holder_(*block_parameters_holder),
+ block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
quantizer_(sequence_header_.color_config.bitdepth,
&frame_header_.quantizer),
residual_size_((sequence_header_.color_config.bitdepth == 8)
@@ -428,11 +438,12 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
tile_scratch_buffer_pool_(
&frame_scratch_buffer->tile_scratch_buffer_pool),
pending_tiles_(pending_tiles),
- build_bit_mask_when_parsing_(false),
frame_parallel_(frame_parallel),
- use_intra_prediction_buffer_(use_intra_prediction_buffer) {
- row_ = number_ / frame_header.tile_info.tile_columns;
- column_ = number_ % frame_header.tile_info.tile_columns;
+ use_intra_prediction_buffer_(use_intra_prediction_buffer),
+ intra_prediction_buffer_(
+ use_intra_prediction_buffer_
+ ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+ : nullptr) {
row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
@@ -454,6 +465,9 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
split_parse_and_decode_ = (thread_pool_ != nullptr &&
superblock_columns_ > intra_block_copy_lag_) ||
frame_parallel;
+ if (frame_parallel_) {
+ reference_frame_progress_cache_.fill(INT_MIN);
+ }
memset(delta_lf_, 0, sizeof(delta_lf_));
delta_lf_all_zero_ = true;
const YuvBuffer& buffer = post_filter_.frame_buffer();
@@ -491,21 +505,6 @@ Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3)
<< subsampling_x_[plane]);
}
- auto& superblock_state = frame_scratch_buffer->superblock_state;
- if (split_parse_and_decode_ && superblock_state.rows() > 0) {
- // The |superblock_state| array is for the entire frame. Set
- // |threading_.sb_state| to point to the beginning of this Tile.
- std::lock_guard<std::mutex> lock(threading_.mutex);
- const int superblock_width_log2 =
- FloorLog2(kBlockWidthPixels[SuperBlockSize()]);
- const int superblock_row_start_index =
- MultiplyBy4(row4x4_start_) >> superblock_width_log2;
- const int superblock_column_start_index =
- MultiplyBy4(column4x4_start_) >> superblock_width_log2;
- threading_.sb_state.Reset(superblock_rows_, superblock_state.columns(),
- &superblock_state[superblock_row_start_index]
- [superblock_column_start_index]);
- }
}
bool Tile::Init() {
@@ -545,28 +544,11 @@ bool Tile::Init() {
return false;
}
}
- if (use_intra_prediction_buffer_) {
- for (int plane = 0; plane < PlaneCount(); ++plane) {
- const size_t intra_prediction_buffer_size =
- (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
- subsampling_x_[plane]) *
- (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
- : sizeof(uint16_t));
- if (!intra_prediction_buffer_[plane].Resize(
- intra_prediction_buffer_size)) {
- LIBGAV1_DLOG(
- ERROR, "Failed to allocate intra prediction buffer for plane %d.\n",
- plane);
- return false;
- }
- }
- }
if (frame_header_.use_ref_frame_mvs) {
assert(sequence_header_.enable_order_hint);
SetupMotionField(frame_header_, current_frame_, reference_frames_,
- sequence_header_.order_hint_shift_bits, row4x4_start_,
- row4x4_end_, column4x4_start_, column4x4_end_,
- &motion_field_);
+ row4x4_start_, row4x4_end_, column4x4_start_,
+ column4x4_end_, &motion_field_);
}
ResetLoopRestorationParams();
return true;
@@ -612,11 +594,10 @@ void Tile::SaveSymbolDecoderContext() {
}
}
-bool Tile::ParseAndDecode(bool is_main_thread) {
+bool Tile::ParseAndDecode() {
// If this is the main thread, we build the loop filter bit masks when parsing
// so that it happens in the current thread. This ensures that the main thread
// does as much work as possible.
- build_bit_mask_when_parsing_ = is_main_thread;
if (split_parse_and_decode_) {
if (!ThreadedParseAndDecode()) return false;
SaveSymbolDecoderContext();
@@ -663,9 +644,72 @@ bool Tile::Parse() {
return true;
}
+bool Tile::Decode(
+ std::mutex* const mutex, int* const superblock_row_progress,
+ std::condition_variable* const superblock_row_progress_condvar) {
+ const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header_.use_128x128_superblock ? 5 : 4;
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+ row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+ if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ if (post_filter_.DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile
+ // except for the first 64 columns.
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+ block_width4x4);
+ // If this is the first superblock row of the tile, then we cannot apply
+ // horizontal deblocking here since we don't know if the top row is
+ // available. So it will be done by the calling thread in that case.
+ if (row4x4 != row4x4_start_) {
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile,
+ // column4x4_end may not be a multiple of 16. In that case it is still
+ // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+ // the filters in increments of 64 columns (or 32 columns for chroma
+ // with subsampling).
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit,
+ column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ }
+ bool notify;
+ {
+ std::unique_lock<std::mutex> lock(*mutex);
+ notify = ++superblock_row_progress[index] ==
+ frame_header_.tile_info.tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ return true;
+}
+
bool Tile::ThreadedParseAndDecode() {
{
std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
+ return false;
+ }
// Account for the parsing job.
++threading_.pending_jobs;
}
@@ -826,14 +870,16 @@ void Tile::PopulateIntraPredictionBuffer(int row4x4) {
if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
return;
}
+ const size_t pixel_size =
+ (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t));
for (int plane = 0; plane < PlaneCount(); ++plane) {
const int row_to_copy =
(MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
const size_t pixels_to_copy =
(MultiplyBy4(column4x4_end_ - column4x4_start_) >>
subsampling_x_[plane]) *
- (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
- : sizeof(uint16_t));
+ pixel_size;
const size_t column_start =
MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
void* start;
@@ -848,7 +894,8 @@ void Tile::PopulateIntraPredictionBuffer(int row4x4) {
{
start = &buffer_[plane][row_to_copy][column_start];
}
- memcpy(intra_prediction_buffer_[plane].get(), start, pixels_to_copy);
+ memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+ start, pixels_to_copy);
}
}
@@ -2067,15 +2114,16 @@ bool Tile::ComputePrediction(const Block& block) {
void Tile::PopulateDeblockFilterLevel(const Block& block) {
if (!post_filter_.DoDeblock()) return;
BlockParameters& bp = *block.bp;
+ const int mode_id =
+ static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
for (int i = 0; i < kFrameLfCount; ++i) {
if (delta_lf_all_zero_) {
bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
- bp.segment_id, i, bp.reference_frame[0],
- LoopFilterMask::GetModeId(bp.y_mode));
+ bp.segment_id, i, bp.reference_frame[0], mode_id);
} else {
bp.deblock_filter_level[i] =
deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]]
- [LoopFilterMask::GetModeId(bp.y_mode)];
+ [mode_id];
}
}
}
@@ -2138,10 +2186,6 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit,
y_limit, bp.segment_id);
}
- if (kDeblockFilterBitMask &&
- (build_bit_mask_when_parsing_ || !split_parse_and_decode_)) {
- BuildBitMask(block);
- }
StoreMotionFieldMvsIntoCurrentFrame(block);
if (!split_parse_and_decode_) {
prediction_parameters_ = std::move(bp.prediction_parameters);
@@ -2164,9 +2208,6 @@ bool Tile::DecodeBlock(ParameterTree* const tree,
!Residual(block, kProcessingModeDecodeOnly)) {
return false;
}
- if (kDeblockFilterBitMask && !build_bit_mask_when_parsing_) {
- BuildBitMask(block);
- }
block.bp->prediction_parameters.reset(nullptr);
return true;
}
@@ -2451,176 +2492,11 @@ void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
}
}
-void Tile::BuildBitMask(const Block& block) {
- if (!post_filter_.DoDeblock()) return;
- if (block.size <= kBlock64x64) {
- BuildBitMaskHelper(block, block.row4x4, block.column4x4, block.size, true,
- true);
- } else {
- const int block_width4x4 = kNum4x4BlocksWide[block.size];
- const int block_height4x4 = kNum4x4BlocksHigh[block.size];
- for (int y = 0; y < block_height4x4; y += 16) {
- for (int x = 0; x < block_width4x4; x += 16) {
- BuildBitMaskHelper(block, block.row4x4 + y, block.column4x4 + x,
- kBlock64x64, x == 0, y == 0);
- }
- }
- }
-}
-
-void Tile::BuildBitMaskHelper(const Block& block, int row4x4, int column4x4,
- BlockSize block_size,
- const bool is_vertical_block_border,
- const bool is_horizontal_block_border) {
- const int block_width4x4 = kNum4x4BlocksWide[block_size];
- const int block_height4x4 = kNum4x4BlocksHigh[block_size];
- BlockParameters& bp = *block.bp;
- const bool skip = bp.skip && bp.is_inter;
- LoopFilterMask* const masks = post_filter_.masks();
- const int unit_id = DivideBy16(row4x4) * masks->num_64x64_blocks_per_row() +
- DivideBy16(column4x4);
-
- for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
- // For U and V planes, do not build bit masks if level == 0.
- if (plane > kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) {
- continue;
- }
- // Build bit mask for vertical edges.
- const int subsampling_x = subsampling_x_[plane];
- const int subsampling_y = subsampling_y_[plane];
- const int column_limit =
- std::min(column4x4 + block_width4x4, deblock_column_limit_[plane]);
- const int row_limit =
- std::min(row4x4 + block_height4x4, deblock_row_limit_[plane]);
- const int row_start = GetDeblockPosition(row4x4, subsampling_y);
- const int column_start = GetDeblockPosition(column4x4, subsampling_x);
- if (row_start >= row_limit || column_start >= column_limit) {
- continue;
- }
- const int vertical_step = 1 << subsampling_y;
- const int horizontal_step = 1 << subsampling_x;
- const BlockParameters& bp =
- *block_parameters_holder_.Find(row_start, column_start);
- const int horizontal_level_index =
- kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal];
- const int vertical_level_index =
- kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical];
- const uint8_t vertical_level =
- bp.deblock_filter_level[vertical_level_index];
-
- for (int row = row_start; row < row_limit; row += vertical_step) {
- for (int column = column_start; column < column_limit;) {
- const TransformSize tx_size = (plane == kPlaneY)
- ? inter_transform_sizes_[row][column]
- : bp.uv_transform_size;
- // (1). Don't filter frame boundary.
- // (2). For tile boundary, we don't know whether the previous tile is
- // available or not, thus we handle it after all tiles are decoded.
- const bool is_vertical_border =
- (column == column_start) && is_vertical_block_border;
- if (column == GetDeblockPosition(column4x4_start_, subsampling_x) ||
- (skip && !is_vertical_border)) {
- column += kNum4x4BlocksWide[tx_size] << subsampling_x;
- continue;
- }
-
- // bp_left is the parameter of the left prediction block which
- // is guaranteed to be inside the tile.
- const BlockParameters& bp_left =
- *block_parameters_holder_.Find(row, column - horizontal_step);
- const uint8_t left_level =
- is_vertical_border
- ? bp_left.deblock_filter_level[vertical_level_index]
- : vertical_level;
- // We don't have to check if the left block is skipped or not,
- // because if the current transform block is on the edge of the coding
- // block, is_vertical_border is true; if it's not on the edge,
- // left skip is equal to skip.
- if (vertical_level != 0 || left_level != 0) {
- const TransformSize left_tx_size =
- (plane == kPlaneY)
- ? inter_transform_sizes_[row][column - horizontal_step]
- : bp_left.uv_transform_size;
- const LoopFilterTransformSizeId transform_size_id =
- GetTransformSizeIdWidth(tx_size, left_tx_size);
- const int r = row & (kNum4x4InLoopFilterMaskUnit - 1);
- const int c = column & (kNum4x4InLoopFilterMaskUnit - 1);
- const int shift = LoopFilterMask::GetShift(r, c);
- const int index = LoopFilterMask::GetIndex(r);
- const auto mask = static_cast<uint64_t>(1) << shift;
- masks->SetLeft(mask, unit_id, plane, transform_size_id, index);
- const uint8_t current_level =
- (vertical_level == 0) ? left_level : vertical_level;
- masks->SetLevel(current_level, unit_id, plane,
- kLoopFilterTypeVertical,
- LoopFilterMask::GetLevelOffset(r, c));
- }
- column += kNum4x4BlocksWide[tx_size] << subsampling_x;
- }
- }
-
- // Build bit mask for horizontal edges.
- const uint8_t horizontal_level =
- bp.deblock_filter_level[horizontal_level_index];
- for (int column = column_start; column < column_limit;
- column += horizontal_step) {
- for (int row = row_start; row < row_limit;) {
- const TransformSize tx_size = (plane == kPlaneY)
- ? inter_transform_sizes_[row][column]
- : bp.uv_transform_size;
-
- // (1). Don't filter frame boundary.
- // (2). For tile boundary, we don't know whether the previous tile is
- // available or not, thus we handle it after all tiles are decoded.
- const bool is_horizontal_border =
- (row == row_start) && is_horizontal_block_border;
- if (row == GetDeblockPosition(row4x4_start_, subsampling_y) ||
- (skip && !is_horizontal_border)) {
- row += kNum4x4BlocksHigh[tx_size] << subsampling_y;
- continue;
- }
-
- // bp_top is the parameter of the top prediction block which is
- // guaranteed to be inside the tile.
- const BlockParameters& bp_top =
- *block_parameters_holder_.Find(row - vertical_step, column);
- const uint8_t top_level =
- is_horizontal_border
- ? bp_top.deblock_filter_level[horizontal_level_index]
- : horizontal_level;
- // We don't have to check it the top block is skipped or not,
- // because if the current transform block is on the edge of the coding
- // block, is_horizontal_border is true; if it's not on the edge,
- // top skip is equal to skip.
- if (horizontal_level != 0 || top_level != 0) {
- const TransformSize top_tx_size =
- (plane == kPlaneY)
- ? inter_transform_sizes_[row - vertical_step][column]
- : bp_top.uv_transform_size;
- const LoopFilterTransformSizeId transform_size_id =
- static_cast<LoopFilterTransformSizeId>(
- std::min({kTransformHeightLog2[tx_size] - 2,
- kTransformHeightLog2[top_tx_size] - 2, 2}));
- const int r = row & (kNum4x4InLoopFilterMaskUnit - 1);
- const int c = column & (kNum4x4InLoopFilterMaskUnit - 1);
- const int shift = LoopFilterMask::GetShift(r, c);
- const int index = LoopFilterMask::GetIndex(r);
- const auto mask = static_cast<uint64_t>(1) << shift;
- masks->SetTop(mask, unit_id, plane, transform_size_id, index);
- const uint8_t current_level =
- (horizontal_level == 0) ? top_level : horizontal_level;
- masks->SetLevel(current_level, unit_id, plane,
- kLoopFilterTypeHorizontal,
- LoopFilterMask::GetLevelOffset(r, c));
- }
- row += kNum4x4BlocksHigh[tx_size] << subsampling_y;
- }
- }
- }
-}
-
void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
- if (frame_header_.refresh_frame_flags == 0) return;
+ if (frame_header_.refresh_frame_flags == 0 ||
+ IsIntraFrame(frame_header_.frame_type)) {
+ return;
+ }
// Iterate over odd rows/columns beginning at the first odd row/column for the
// block. It is done this way because motion field mvs are only needed at an
// 8x8 granularity.
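A small sketch of the 8x8 sampling described in this comment; the helper below is illustrative only (the decoder does the equivalent with DivideBy2() on the 4x4 start/limit, as in the hunk that follows).

// Sketch (assumed helper, not the library's code): only odd 4x4 rows/columns
// carry a motion-field entry, and odd 4x4 index i maps to i / 2 in the 8x8
// grid, matching the spec's MfMvs[i * 2 + 1][j * 2 + 1] sampling.
#include <cstdio>

void PrintMotionFieldRows(int row4x4_start, int row4x4_limit) {
  const int first_odd_row = row4x4_start | 1;  // First odd 4x4 row in range.
  for (int row4x4 = first_odd_row; row4x4 < row4x4_limit; row4x4 += 2) {
    std::printf("4x4 row %d -> 8x8 row %d\n", row4x4, row4x4 >> 1);
  }
}

int main() {
  // A 16x16 block starting at 4x4 row 4 covers rows [4, 8): rows 5 and 7 are
  // stored, at 8x8 rows 2 and 3.
  PrintMotionFieldRows(/*row4x4_start=*/4, /*row4x4_limit=*/8);
  return 0;
}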
@@ -2636,6 +2512,7 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
// The largest reference MV component that can be saved.
constexpr int kRefMvsLimit = (1 << 12) - 1;
const BlockParameters& bp = *block.bp;
+ ReferenceInfo* reference_info = current_frame_.reference_info();
for (int i = 1; i >= 0; --i) {
const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
// Must make a local copy so that StoreMotionFieldMvs() knows there is no
@@ -2649,12 +2526,7 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
// The next line is equivalent to:
// mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
(mv_row | mv_column) <= kRefMvsLimit &&
- GetRelativeDistance(
- reference_order_hint_
- [frame_header_.reference_frame_index[reference_frame_to_store -
- kReferenceFrameLast]],
- frame_header_.order_hint,
- sequence_header_.order_hint_shift_bits) < 0) {
+ reference_info->relative_distance_from[reference_frame_to_store] < 0) {
const int row_start8x8 = DivideBy2(row_start4x4);
const int row_limit8x8 = DivideBy2(row_limit4x4);
const int column_start8x8 = DivideBy2(column_start4x4);
@@ -2663,10 +2535,10 @@ void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
const int columns = column_limit8x8 - column_start8x8;
const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
ReferenceFrameType* const reference_frame_row_start =
- current_frame_.motion_field_reference_frame(row_start8x8,
- column_start8x8);
+ &reference_info
+ ->motion_field_reference_frame[row_start8x8][column_start8x8];
MotionVector* const mv =
- current_frame_.motion_field_mv(row_start8x8, column_start8x8);
+ &reference_info->motion_field_mv[row_start8x8][column_start8x8];
// Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined
// and simplifies std::fill() for these cases.
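A minimal sketch of the specialization mentioned in the comment above; the function and values are illustrative, while the tile code applies the same idea when filling reference-frame types and motion vectors row by row.

// Sketch (not the library's code): dispatching on the small set of possible
// column counts gives the compiler a constant length, so memset() is inlined
// and std::fill() collapses to simple stores.
#include <algorithm>
#include <cstdint>
#include <cstring>

void FillRow(uint8_t* dst, int columns, uint8_t value) {
  switch (columns) {
    case 1: std::memset(dst, value, 1); break;
    case 2: std::memset(dst, value, 2); break;
    case 4: std::memset(dst, value, 4); break;
    case 8: std::memset(dst, value, 8); break;
    case 16: std::memset(dst, value, 16); break;
    default:
      // Fallback for widths not in the specialized set.
      std::fill(dst, dst + columns, value);
      break;
  }
}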
diff --git a/chromium/third_party/libgav1/src/src/utils/array_2d.h b/chromium/third_party/libgav1/src/src/utils/array_2d.h
index 941d4b16f87..2df624187d0 100644
--- a/chromium/third_party/libgav1/src/src/utils/array_2d.h
+++ b/chromium/third_party/libgav1/src/src/utils/array_2d.h
@@ -113,6 +113,7 @@ class Array2D {
int columns() const { return data_view_.columns(); }
size_t size() const { return size_; }
T* data() { return data_.get(); }
+ const T* data() const { return data_.get(); }
T* operator[](int row) { return data_view_[row]; }
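The added const overload matters for read-only callers. A sketch, assuming the in-tree include path and the Array2D interface shown in this hunk (size(), data()):

// Sketch: without const T* data() const, this function would not compile,
// because only the non-const overload existed on a const Array2D.
#include "src/utils/array_2d.h"

int Sum(const libgav1::Array2D<int>& array) {
  const int* p = array.data();  // Uses the new const overload.
  int sum = 0;
  for (size_t i = 0; i < array.size(); ++i) sum += p[i];
  return sum;
}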
diff --git a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc
index b52e91d6c97..79bb2b8f7e1 100644
--- a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc
+++ b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.cc
@@ -35,13 +35,11 @@ int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) {
} // namespace
-BlockParametersHolder::BlockParametersHolder(int rows4x4, int columns4x4,
- bool use_128x128_superblock)
- : rows4x4_(rows4x4),
- columns4x4_(columns4x4),
- use_128x128_superblock_(use_128x128_superblock) {}
-
-bool BlockParametersHolder::Init() {
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4,
+ bool use_128x128_superblock) {
+ rows4x4_ = rows4x4;
+ columns4x4_ = columns4x4;
+ use_128x128_superblock_ = use_128x128_superblock;
if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) {
LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed.");
return false;
diff --git a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h
index 909de5eefa3..35543c30a4e 100644
--- a/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h
+++ b/chromium/third_party/libgav1/src/src/utils/block_parameters_holder.h
@@ -31,17 +31,16 @@ namespace libgav1 {
// corresponding to a superblock.
class BlockParametersHolder {
public:
- // If |use_128x128_superblock| is true, 128x128 superblocks will be used,
- // otherwise 64x64 superblocks will be used.
- BlockParametersHolder(int rows4x4, int columns4x4,
- bool use_128x128_superblock);
+ BlockParametersHolder() = default;
// Not copyable or movable.
BlockParametersHolder(const BlockParametersHolder&) = delete;
BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
- // Must be called first.
- LIBGAV1_MUST_USE_RESULT bool Init();
+ // If |use_128x128_superblock| is true, 128x128 superblocks will be used,
+ // otherwise 64x64 superblocks will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4,
+ bool use_128x128_superblock);
// Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
// is done as a simple look up of the |block_parameters_cache_| matrix.
@@ -54,6 +53,10 @@ class BlockParametersHolder {
return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
}
+ BlockParameters* const* Address(int row4x4, int column4x4) const {
+ return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+ }
+
int columns4x4() const { return columns4x4_; }
// Returns the ParameterTree corresponding to superblock starting at (|row|,
@@ -66,9 +69,9 @@ class BlockParametersHolder {
BlockParameters* bp);
private:
- const int rows4x4_;
- const int columns4x4_;
- const bool use_128x128_superblock_;
+ int rows4x4_ = 0;
+ int columns4x4_ = 0;
+ bool use_128x128_superblock_ = false;
Array2D<std::unique_ptr<ParameterTree>> trees_;
// This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by
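Caller-side sketch of the constructor+Init() to Reset() change; the wrapper below is illustrative, not the decoder's actual call site.

// Sketch: the holder is now default-constructed once and re-sized per frame.
#include "src/utils/block_parameters_holder.h"

bool PrepareHolder(libgav1::BlockParametersHolder* holder, int rows4x4,
                   int columns4x4, bool use_128x128_superblock) {
  // Old pattern (before this patch):
  //   BlockParametersHolder holder(rows4x4, columns4x4, use_128x128_superblock);
  //   if (!holder.Init()) return false;
  // New pattern:
  return holder->Reset(rows4x4, columns4x4, use_128x128_superblock);
}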
diff --git a/chromium/third_party/libgav1/src/src/utils/common.h b/chromium/third_party/libgav1/src/src/utils/common.h
index 56f413a2849..d6e019933e2 100644
--- a/chromium/third_party/libgav1/src/src/utils/common.h
+++ b/chromium/third_party/libgav1/src/src/utils/common.h
@@ -400,19 +400,17 @@ constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; }
// 7.9.3. (without the clamp for numerator and denominator).
inline void GetMvProjection(const MotionVector& mv, int numerator,
- int denominator, MotionVector* projection_mv) {
- // Allow numerator and denominator to be 0 so that this function can be called
- // unconditionally. When either numerator or denominator is 0, |projection_mv|
- // will be 0, and this is what we want.
+ int division_multiplier,
+ MotionVector* projection_mv) {
+ // Allow numerator to be 0 so that this function can be called
+ // unconditionally. When numerator is 0, |projection_mv| will be 0, and this
+ // is what we want.
assert(std::abs(numerator) <= kMaxFrameDistance);
- assert(denominator >= 0);
- assert(denominator <= kMaxFrameDistance);
for (int i = 0; i < 2; ++i) {
- projection_mv->mv[i] = Clip3(
- RightShiftWithRoundingSigned(
- mv.mv[i] * numerator * kProjectionMvDivisionLookup[denominator],
- 14),
- -kProjectionMvClamp, kProjectionMvClamp);
+ projection_mv->mv[i] =
+ Clip3(RightShiftWithRoundingSigned(
+ mv.mv[i] * numerator * division_multiplier, 14),
+ -kProjectionMvClamp, kProjectionMvClamp);
}
}
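Worked example of the new GetMvProjection() arithmetic. The helper below is a local stand-in for RightShiftWithRoundingSigned(), and 8192 stands for the multiplier a caller would derive for a denominator of 2 (roughly (1 << 14) / denominator, per kProjectionMvDivisionLookup).

// Sketch: one multiply plus a rounded shift by 14 replaces the per-call
// division lookup that the old signature did internally.
#include <cstdio>
#include <cstdlib>

int RightShiftWithRoundingSignedLocal(int value, int bits) {
  const int rounded = (std::abs(value) + (1 << (bits - 1))) >> bits;
  return (value < 0) ? -rounded : rounded;
}

int main() {
  const int mv_component = 64;           // Reference mv component.
  const int numerator = 3;               // Distance to the projected frame.
  const int division_multiplier = 8192;  // ~ (1 << 14) / 2, denominator of 2.
  const int projected = RightShiftWithRoundingSignedLocal(
      mv_component * numerator * division_multiplier, 14);
  std::printf("projected mv = %d\n", projected);  // 64 * 3 / 2 == 96
  return 0;
}

Presumably the per-frame multipliers now come from ReferenceInfo's projection_divisions array (introduced later in this patch), so callers no longer index the division lookup per motion vector.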
diff --git a/chromium/third_party/libgav1/src/src/utils/constants.h b/chromium/third_party/libgav1/src/src/utils/constants.h
index 868bfdc8c82..f070767ecb6 100644
--- a/chromium/third_party/libgav1/src/src/utils/constants.h
+++ b/chromium/third_party/libgav1/src/src/utils/constants.h
@@ -27,11 +27,14 @@ namespace libgav1 {
// Returns the number of elements between begin (inclusive) and end (inclusive).
constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
-#if defined(ENABLE_DEBLOCK_BIT_MASK)
-constexpr bool kDeblockFilterBitMask = true;
+enum {
+// Maximum number of threads that the library will ever create.
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+ kMaxThreads = LIBGAV1_MAX_THREADS
#else
-constexpr bool kDeblockFilterBitMask = false;
-#endif // defined(ENABLE_DEBLOCK_BIT_MASK)
+ kMaxThreads = 128
+#endif
+}; // anonymous enum
enum {
kInvalidMvValue = -32768,
@@ -44,7 +47,6 @@ enum {
kFrameLfCount = 4,
kMaxLoopFilterValue = 63,
kNum4x4In64x64 = 256,
- kNumLoopFilterMasks = 4,
kMaxAngleDelta = 3,
kDirectionalIntraModes = 8,
kMaxSuperBlockSizeLog2 = 7,
@@ -97,24 +99,19 @@ enum {
kMaxSuperBlockSizeInPixels = 128,
kMaxScaledSuperBlockSizeInPixels = 128 * 2,
kMaxSuperBlockSizeSquareInPixels = 128 * 128,
- kNum4x4InLoopFilterMaskUnit = 16,
+ kNum4x4InLoopFilterUnit = 16,
kProjectionMvClamp = (1 << 14) - 1, // == 16383
kProjectionMvMaxHorizontalOffset = 8,
+ kCdefUnitSize = 64,
+ kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kRestorationBorder,
kRestorationUnitOffset = 8,
- // 2 pixel padding for 5x5 box sum on each side.
- kRestorationPadding = 4,
// Loop restoration's processing unit size is fixed as 64x64.
- kRestorationProcessingUnitSize = 64,
- kRestorationProcessingUnitSizeWithBorders =
- kRestorationProcessingUnitSize + 2 * kRestorationBorder,
- // The max size of a box filter process output buffer.
- kMaxBoxFilterProcessOutputPixels = kRestorationProcessingUnitSize *
- kRestorationProcessingUnitSize, // == 4096
- // The max size of a box filter process intermediate buffer.
- kBoxFilterProcessIntermediatePixels =
- (kRestorationProcessingUnitSizeWithBorders + kRestorationPadding) *
- (kRestorationProcessingUnitSizeWithBorders +
- kRestorationPadding), // == 5476
+ kRestorationUnitHeight = 64,
+ kRestorationUnitWidth = 256,
+ kRestorationUnitHeightWithBorders =
+ kRestorationUnitHeight + 2 * kRestorationBorder,
+ kRestorationUnitWidthWithBorders =
+ kRestorationUnitWidth + 2 * kRestorationBorder,
kSuperResFilterBits = 6,
kSuperResFilterShifts = 1 << kSuperResFilterBits,
kSuperResFilterTaps = 8,
@@ -148,8 +145,6 @@ enum {
kMaxFrameDistance = 31,
kReferenceFrameScalePrecision = 14,
kNumWienerCoefficients = 3,
- // Maximum number of threads that the library will ever use at any given time.
- kMaxThreads = 32,
kLoopFilterMaxModeDeltas = 2,
kMaxCdefStrengths = 8,
kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available.
@@ -512,14 +507,6 @@ enum ObuType : int8_t {
kObuPadding = 15,
};
-// Enum to track the processing state of a superblock.
-enum SuperBlockState : uint8_t {
- kSuperBlockStateNone, // Not yet parsed or decoded.
- kSuperBlockStateParsed, // Parsed but not yet decoded.
- kSuperBlockStateScheduled, // Scheduled for decoding.
- kSuperBlockStateDecoded // Parsed and decoded.
-};
-
//------------------------------------------------------------------------------
// ToString()
//
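Earlier in this constants.h diff, the fixed kMaxThreads = 32 became a build-time cap defaulting to 128. A standalone sketch of the pattern, assuming the macro is supplied on the compiler command line (e.g. -DLIBGAV1_MAX_THREADS=8); this mirrors the enum shown above rather than reproducing the header.

// Sketch: an anonymous enum whose value can be overridden by a preprocessor
// definition, falling back to 128 when the macro is absent or non-positive.
#include <cstdio>

enum {
#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
  kMaxThreadsSketch = LIBGAV1_MAX_THREADS
#else
  kMaxThreadsSketch = 128
#endif
};

int main() {
  std::printf("thread cap: %d\n", static_cast<int>(kMaxThreadsSketch));
  return 0;
}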
diff --git a/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake b/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake
index 50bf941306f..8b6ec4bee32 100644
--- a/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake
+++ b/chromium/third_party/libgav1/src/src/utils/libgav1_utils.cmake
@@ -44,6 +44,7 @@ list(APPEND libgav1_utils_sources
"${libgav1_source}/utils/queue.h"
"${libgav1_source}/utils/raw_bit_reader.cc"
"${libgav1_source}/utils/raw_bit_reader.h"
+ "${libgav1_source}/utils/reference_info.h"
"${libgav1_source}/utils/segmentation.cc"
"${libgav1_source}/utils/segmentation.h"
"${libgav1_source}/utils/segmentation_map.cc"
diff --git a/chromium/third_party/libgav1/src/src/utils/reference_info.h b/chromium/third_party/libgav1/src/src/utils/reference_info.h
new file mode 100644
index 00000000000..a6607912ab8
--- /dev/null
+++ b/chromium/third_party/libgav1/src/src/utils/reference_info.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This struct collects some members related to reference frames in one place to
+// make it easier to pass them as parameters to some dsp functions.
+struct ReferenceInfo {
+ // Initialize |motion_field_reference_frame| so that
+ // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when
+ // the updates are the same as the initialized value.
+ // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify
+ // branch conditions in motion field projection.
+ // Initializing this contiguous memory is very fast. It is not recommended
+ // to make the initialization multi-threaded unless the memory initialized
+ // by each thread is still contiguous.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) {
+ return motion_field_reference_frame.Reset(rows, columns,
+ /*zero_initialize=*/true) &&
+ motion_field_mv.Reset(
+ rows, columns,
+#if LIBGAV1_MSAN
+ // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only
+ // for qualified blocks. In MotionFieldProjectionKernel() dsp
+ // optimizations, it is read whether or not it was set.
+ /*zero_initialize=*/true
+#else
+ /*zero_initialize=*/false
+#endif
+ );
+ }
+
+ // All members are used by inter frames only.
+ // For intra frames, they are not initialized.
+
+ std::array<uint8_t, kNumReferenceFrameTypes> order_hint;
+
+ // An example when |relative_distance_from| does not equal
+ // -|relative_distance_to|:
+ // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64
+ // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64
+ // This is why we need both |relative_distance_from| and
+ // |relative_distance_to|.
+ // |relative_distance_from|: Relative distances from reference frames to this
+ // frame.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from;
+ // |relative_distance_to|: Relative distances to reference frames.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to;
+
+ // Skip motion field projection of specific types of frames if their
+ // |relative_distance_to| is negative or too large.
+ std::array<bool, kNumReferenceFrameTypes> skip_references;
+ // Lookup table of motion field projection division multipliers for specific
+ // frame types. Derived from kProjectionMvDivisionLookup.
+ std::array<int16_t, kNumReferenceFrameTypes> projection_divisions;
+
+ // The current frame's |motion_field_reference_frame| and |motion_field_mv|
+ // are guaranteed to be allocated only when refresh_frame_flags is not 0.
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<ReferenceFrameType> motion_field_reference_frame;
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<MotionVector> motion_field_mv;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
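The relative-distance example in the ReferenceInfo comment above comes from order-hint wrap-around. Below is a sketch of the spec's get_relative_dist() written out for 7-bit order hints (an order_hint_shift_bits of 25 leaves 32 - 25 = 7 significant bits); it is an illustration, not the library's GetRelativeDistance() implementation.

// Sketch: folding an order-hint difference into the signed 7-bit range.
#include <cstdio>

int GetRelativeDistance7Bit(int a, int b) {
  const int order_hint_bits = 7;
  const int m = 1 << (order_hint_bits - 1);  // 64
  const int diff = a - b;
  // Fold the difference into [-64, 63].
  return (diff & (m - 1)) - (diff & m);
}

int main() {
  // 7 - 71 == -64 stays -64, while 71 - 7 == 64 wraps to -64 as well, so
  // negating one distance cannot recover the other; ReferenceInfo therefore
  // stores both |relative_distance_from| and |relative_distance_to|.
  std::printf("from: %d\n", GetRelativeDistance7Bit(7, 71));   // -64
  std::printf("to  : %d\n", -GetRelativeDistance7Bit(71, 7));  // 64
  return 0;
}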
diff --git a/chromium/third_party/libgav1/src/src/utils/types.h b/chromium/third_party/libgav1/src/src/utils/types.h
index 8a95bdb20f9..89b35ad7b21 100644
--- a/chromium/third_party/libgav1/src/src/utils/types.h
+++ b/chromium/third_party/libgav1/src/src/utils/types.h
@@ -283,8 +283,10 @@ struct Delta {
};
struct Cdef {
- uint8_t damping;
+ uint8_t damping; // damping value from the spec + (bitdepth - 8).
uint8_t bits;
+ // All the strength values are the values from the spec, left-shifted by
+ // (bitdepth - 8).
uint8_t y_primary_strength[kMaxCdefStrengths];
uint8_t y_secondary_strength[kMaxCdefStrengths];
uint8_t uv_primary_strength[kMaxCdefStrengths];