author     Jana Grill <janagrill@google.com>          2021-02-17 12:35:20 +0000
committer  Michael Brüning <michael.bruning@qt.io>    2021-04-09 10:50:31 +0000
commit     fcd5c56fe795bb48ff3b31e0fff038875c5ad689 (patch)
tree       eb133a6c1e56daabcf4418a517d856fb8a727c4a
parent     35caa7c78115cc22c405c1b6de386b73d36f8609 (diff)
download   qtwebengine-chromium-fcd5c56fe795bb48ff3b31e0fff038875c5ad689.tar.gz
[Backport] Security bug 1062941
Manual backport (library update) of patch originally reviewed on
https://chromium-review.googlesource.com/c/chromium/src/+/2692542:

Roll src/third_party/libyuv/ 6866adbec..1d3f901aa (17 commits)

https://chromium.googlesource.com/libyuv/libyuv.git/+log/6866adbec5af..1d3f901aa016

$ git log 6866adbec..1d3f901aa --date=short --no-merges --format='%ad %ae %s'
2020-12-25 fbarchard Scale bug fix with msan when scaling up in height and down in width with box filter.
2020-12-22 fbarchard Test Box filter scale plane with 1 dimension growing and the other reducing
2020-12-03 eshr NV12 Copy, include scale_uv.h
2020-11-18 thakis Stop setting mac_xcode_version in DEPS
2020-11-06 libyuv-ci-autoroll-builder Roll chromium_revision 5aaa70b53c..64c8c30faa (822628:824854)
2020-11-03 fbarchard Scale by even factor low level row function
2020-10-30 libyuv-ci-autoroll-builder Roll chromium_revision df9aecfc0b..5aaa70b53c (820568:822628)
2020-10-28 fbarchard PlaneScale, UVScale and ARGBScale test 3x and 4x down sample.
2020-10-27 fbarchard MJPGToNV12 added and build files sorted
2020-10-24 libyuv-ci-autoroll-builder Roll chromium_revision e812106b13..df9aecfc0b (817907:820568)
2020-10-16 libyuv-ci-autoroll-builder Roll chromium_revision 4892423355..e812106b13 (815587:817907)
2020-10-13 fbarchard UVScale down use AVX2 and Neon for aarch32
2020-10-13 fbarchard UVScale down by 4 use SSSE3/NEON
2020-10-12 fbarchard 2x down sample for UV planes ported to SSSE3 / NEON
2020-10-09 libyuv-ci-autoroll-builder Roll chromium_revision ccec2ad009..4892423355 (811963:815587)
2020-10-02 fbarchard I420ToARGB prototype added to convert_from.h
2020-10-01 fbarchard scale neon adjust PRFM instruction to co-issue with math

Created with:
  roll-dep src/third_party/libyuv

(cherry picked from commit 1a60856f34aa15def686168c3b392dc37a120c51)

Bug: chromium:1158178, chromium:1062941, libyuv:875, b/176195584
Change-Id: Iecf360198a90acabcbd71e57791634f5e3e861c3
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Eugene Zemtsov <eugene@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#839493}
Commit-Queue: Jana Grill <janagrill@chromium.org>
Reviewed-by: Victor-Gabriel Savu <vsavu@google.com>
Cr-Commit-Position: refs/branch-heads/4240@{#1545}
Cr-Branched-From: f297677702651916bbf65e59c0d4bbd4ce57d1ee-refs/heads/master@{#800218}
Reviewed-by: Jüri Valdmann <juri.valdmann@qt.io>
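The headline security fix in this roll concerns the box-filter scale path when one dimension grows while the other shrinks. A minimal sketch of a call that exercises that path, assuming the ScalePlane prototype from this roll's include/libyuv/scale.h; the dimensions are illustrative only, not taken from the bug report:

// Sketch only: exercises the box-filter case fixed in this roll
// (width shrinks while height grows). Assumes libyuv's ScalePlane.
#include <cstdint>
#include <vector>
#include "libyuv/scale.h"  // ScalePlane, kFilterBox

int main() {
  // Width shrinks (64 -> 16) while height grows (8 -> 32): the
  // combination the msan-reported box-filter fix targets.
  const int src_w = 64, src_h = 8;
  const int dst_w = 16, dst_h = 32;
  std::vector<uint8_t> src(static_cast<size_t>(src_w) * src_h, 128);
  std::vector<uint8_t> dst(static_cast<size_t>(dst_w) * dst_h);
  libyuv::ScalePlane(src.data(), src_w, src_w, src_h,
                     dst.data(), dst_w, dst_w, dst_h,
                     libyuv::kFilterBox);
  return 0;
}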
-rw-r--r--  chromium/third_party/libyuv/Android.bp | 26
-rw-r--r--  chromium/third_party/libyuv/Android.mk | 12
-rw-r--r--  chromium/third_party/libyuv/BUILD.gn | 87
-rw-r--r--  chromium/third_party/libyuv/DEPS | 2716
-rw-r--r--  chromium/third_party/libyuv/OWNERS | 8
-rw-r--r--  chromium/third_party/libyuv/README.chromium | 2
-rw-r--r--  chromium/third_party/libyuv/README.md | 8
-rw-r--r--  chromium/third_party/libyuv/build_overrides/build.gni | 10
-rw-r--r--  chromium/third_party/libyuv/docs/environment_variables.md | 20
-rw-r--r--  chromium/third_party/libyuv/docs/formats.md | 12
-rw-r--r--  chromium/third_party/libyuv/docs/getting_started.md | 26
-rw-r--r--  chromium/third_party/libyuv/docs/rotation.md | 4
-rw-r--r--  chromium/third_party/libyuv/include/libyuv.h | 1
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/compare_row.h | 12
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert.h | 138
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_argb.h | 1136
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_from.h | 169
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_from_argb.h | 30
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/cpu_id.h | 3
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/macros_msa.h | 3
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/planar_functions.h | 141
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/rotate.h | 26
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/rotate_row.h | 29
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/row.h | 1060
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale.h | 73
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_row.h | 429
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_uv.h | 38
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/version.h | 2
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/video_common.h | 33
-rw-r--r--  chromium/third_party/libyuv/libyuv.gni | 11
-rw-r--r--  chromium/third_party/libyuv/linux.mk | 29
-rw-r--r--  chromium/third_party/libyuv/source/compare.cc | 15
-rw-r--r--  chromium/third_party/libyuv/source/compare_gcc.cc | 344
-rw-r--r--  chromium/third_party/libyuv/source/compare_mmi.cc | 123
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon.cc | 70
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon64.cc | 68
-rw-r--r--  chromium/third_party/libyuv/source/convert.cc | 1108
-rw-r--r--  chromium/third_party/libyuv/source/convert_argb.cc | 2340
-rw-r--r--  chromium/third_party/libyuv/source/convert_from.cc | 790
-rw-r--r--  chromium/third_party/libyuv/source/convert_from_argb.cc | 616
-rw-r--r--  chromium/third_party/libyuv/source/convert_jpeg.cc | 310
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_argb.cc | 89
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_i420.cc | 25
-rw-r--r--  chromium/third_party/libyuv/source/cpu_id.cc | 44
-rw-r--r--  chromium/third_party/libyuv/source/mjpeg_decoder.cc | 16
-rw-r--r--  chromium/third_party/libyuv/source/mjpeg_validate.cc | 27
-rw-r--r--  chromium/third_party/libyuv/source/planar_functions.cc | 984
-rw-r--r--  chromium/third_party/libyuv/source/rotate.cc | 149
-rw-r--r--  chromium/third_party/libyuv/source/rotate_any.cc | 6
-rw-r--r--  chromium/third_party/libyuv/source/rotate_argb.cc | 95
-rw-r--r--  chromium/third_party/libyuv/source/rotate_gcc.cc | 540
-rw-r--r--  chromium/third_party/libyuv/source/rotate_mmi.cc | 291
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon.cc | 218
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon64.cc | 281
-rw-r--r--  chromium/third_party/libyuv/source/row_any.cc | 379
-rw-r--r--  chromium/third_party/libyuv/source/row_common.cc | 888
-rw-r--r--  chromium/third_party/libyuv/source/row_gcc.cc | 6422
-rw-r--r--  chromium/third_party/libyuv/source/row_mmi.cc | 7842
-rw-r--r--  chromium/third_party/libyuv/source/row_msa.cc | 602
-rw-r--r--  chromium/third_party/libyuv/source/row_neon.cc | 2482
-rw-r--r--  chromium/third_party/libyuv/source/row_neon64.cc | 2893
-rw-r--r--  chromium/third_party/libyuv/source/row_win.cc | 31
-rw-r--r--  chromium/third_party/libyuv/source/scale.cc | 196
-rw-r--r--  chromium/third_party/libyuv/source/scale_any.cc | 231
-rw-r--r--  chromium/third_party/libyuv/source/scale_argb.cc | 83
-rw-r--r--  chromium/third_party/libyuv/source/scale_common.cc | 247
-rw-r--r--  chromium/third_party/libyuv/source/scale_gcc.cc | 1420
-rw-r--r--  chromium/third_party/libyuv/source/scale_mmi.cc | 1168
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon.cc | 754
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon64.cc | 940
-rw-r--r--  chromium/third_party/libyuv/source/scale_uv.cc | 891
-rw-r--r--  chromium/third_party/libyuv/tools_libyuv/OWNERS | 5
-rwxr-xr-x  chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py | 83
-rw-r--r--  chromium/third_party/libyuv/tools_libyuv/msan/OWNERS | 4
-rw-r--r--  chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS | 5
-rw-r--r--  chromium/third_party/libyuv/unit_test/color_test.cc | 153
-rw-r--r--  chromium/third_party/libyuv/unit_test/compare_test.cc | 7
-rw-r--r--  chromium/third_party/libyuv/unit_test/convert_test.cc | 1805
-rw-r--r--  chromium/third_party/libyuv/unit_test/cpu_test.cc | 24
-rw-r--r--  chromium/third_party/libyuv/unit_test/cpu_thread_test.cc | 4
-rw-r--r--  chromium/third_party/libyuv/unit_test/math_test.cc | 5
-rw-r--r--  chromium/third_party/libyuv/unit_test/planar_test.cc | 569
-rw-r--r--  chromium/third_party/libyuv/unit_test/rotate_argb_test.cc | 42
-rw-r--r--  chromium/third_party/libyuv/unit_test/rotate_test.cc | 117
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_argb_test.cc | 81
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_test.cc | 752
-rw-r--r--  chromium/third_party/libyuv/unit_test/unit_test.cc | 6
-rw-r--r--  chromium/third_party/libyuv/unit_test/video_common_test.cc | 2
-rw-r--r--  chromium/third_party/libyuv/util/cpuid.c | 13
-rw-r--r--  chromium/third_party/libyuv/util/i444tonv12_eg.cc | 28
-rw-r--r--  chromium/third_party/libyuv/util/psnr.cc | 2
-rw-r--r--  chromium/third_party/libyuv/winarm.mk | 1
92 files changed, 35235 insertions, 10785 deletions
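Among the added files, include/libyuv/scale_uv.h (38 lines, see the diffstat above) introduces scaling for interleaved UV planes, i.e. the NV12/NV21 chroma layout referenced in the rolled commits. A hedged usage sketch, assuming the UVScale prototype that header declares (int return, 0 on success) and that FilterMode comes from libyuv/scale.h:

// Sketch only: halves an interleaved UV plane using the UVScale entry
// point added by this roll. Signature is assumed from scale_uv.h.
#include <cstdint>
#include <vector>
#include "libyuv/scale.h"     // FilterMode
#include "libyuv/scale_uv.h"  // UVScale (new in this roll)

int main() {
  // An interleaved UV plane stores U and V bytes side by side, so the
  // stride is twice the pixel width.
  const int src_w = 320, src_h = 240;
  const int dst_w = 160, dst_h = 120;
  std::vector<uint8_t> src_uv(static_cast<size_t>(src_w) * 2 * src_h, 128);
  std::vector<uint8_t> dst_uv(static_cast<size_t>(dst_w) * 2 * dst_h);
  // Assumed to return 0 on success, per libyuv convention.
  return libyuv::UVScale(src_uv.data(), src_w * 2, src_w, src_h,
                         dst_uv.data(), dst_w * 2, dst_w, dst_h,
                         libyuv::kFilterBilinear);
}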
diff --git a/chromium/third_party/libyuv/Android.bp b/chromium/third_party/libyuv/Android.bp
index fc6a81fc66a..d0b23432628 100644
--- a/chromium/third_party/libyuv/Android.bp
+++ b/chromium/third_party/libyuv/Android.bp
@@ -9,28 +9,34 @@ cc_library {
"source/compare.cc",
"source/compare_common.cc",
"source/compare_gcc.cc",
+ "source/compare_mmi.cc",
+ "source/compare_msa.cc",
"source/compare_neon.cc",
"source/compare_neon64.cc",
- "source/compare_msa.cc",
"source/convert.cc",
"source/convert_argb.cc",
"source/convert_from.cc",
"source/convert_from_argb.cc",
+ "source/convert_jpeg.cc",
"source/convert_to_argb.cc",
"source/convert_to_i420.cc",
"source/cpu_id.cc",
+ "source/mjpeg_decoder.cc",
+ "source/mjpeg_validate.cc",
"source/planar_functions.cc",
"source/rotate.cc",
"source/rotate_any.cc",
"source/rotate_argb.cc",
"source/rotate_common.cc",
"source/rotate_gcc.cc",
+ "source/rotate_mmi.cc",
"source/rotate_msa.cc",
"source/rotate_neon.cc",
"source/rotate_neon64.cc",
"source/row_any.cc",
"source/row_common.cc",
"source/row_gcc.cc",
+ "source/row_mmi.cc",
"source/row_msa.cc",
"source/row_neon.cc",
"source/row_neon64.cc",
@@ -39,13 +45,12 @@ cc_library {
"source/scale_argb.cc",
"source/scale_common.cc",
"source/scale_gcc.cc",
+ "source/scale_mmi.cc",
"source/scale_msa.cc",
"source/scale_neon.cc",
"source/scale_neon64.cc",
+ "source/scale_uv.cc",
"source/video_common.cc",
- "source/convert_jpeg.cc",
- "source/mjpeg_decoder.cc",
- "source/mjpeg_validate.cc",
],
cflags: [
@@ -65,6 +70,7 @@ cc_library {
// with libyuv (b/37646797)
cc_library_static {
name: "libyuv_static",
+ vendor_available: true,
whole_static_libs: ["libyuv"],
}
@@ -74,7 +80,6 @@ cc_test {
shared_libs: ["libjpeg"],
cflags: ["-Wall", "-Werror"],
srcs: [
- "unit_test/unit_test.cc",
"unit_test/basictypes_test.cc",
"unit_test/color_test.cc",
"unit_test/compare_test.cc",
@@ -87,6 +92,8 @@ cc_test {
"unit_test/rotate_test.cc",
"unit_test/scale_argb_test.cc",
"unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
+ "unit_test/unit_test.cc",
"unit_test/video_common_test.cc",
],
}
@@ -101,6 +108,15 @@ cc_test {
}
cc_test {
+ name: "i444tonv12_eg",
+ gtest: false,
+ srcs: [
+ "util/i444tonv12_eg.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
name: "cpuid",
gtest: false,
srcs: [
diff --git a/chromium/third_party/libyuv/Android.mk b/chromium/third_party/libyuv/Android.mk
index dbc6cad37ab..2ceb49281be 100644
--- a/chromium/third_party/libyuv/Android.mk
+++ b/chromium/third_party/libyuv/Android.mk
@@ -9,9 +9,11 @@ LOCAL_SRC_FILES := \
source/compare.cc \
source/compare_common.cc \
source/compare_gcc.cc \
+ source/compare_mmi.cc \
source/compare_msa.cc \
source/compare_neon.cc \
source/compare_neon64.cc \
+ source/compare_win.cc \
source/convert.cc \
source/convert_argb.cc \
source/convert_from.cc \
@@ -25,23 +27,30 @@ LOCAL_SRC_FILES := \
source/rotate_argb.cc \
source/rotate_common.cc \
source/rotate_gcc.cc \
+ source/rotate_mmi.cc \
source/rotate_msa.cc \
source/rotate_neon.cc \
source/rotate_neon64.cc \
+ source/rotate_win.cc \
source/row_any.cc \
source/row_common.cc \
source/row_gcc.cc \
+ source/row_mmi.cc \
source/row_msa.cc \
source/row_neon.cc \
source/row_neon64.cc \
+ source/row_win.cc \
source/scale.cc \
source/scale_any.cc \
source/scale_argb.cc \
source/scale_common.cc \
source/scale_gcc.cc \
+ source/scale_mmi.cc \
source/scale_msa.cc \
source/scale_neon.cc \
source/scale_neon64.cc \
+ source/scale_uv.cc \
+ source/scale_win.cc \
source/video_common.cc
common_CFLAGS := -Wall -fexceptions
@@ -81,7 +90,6 @@ LOCAL_MODULE_TAGS := tests
LOCAL_CPP_EXTENSION := .cc
LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
LOCAL_SRC_FILES := \
- unit_test/unit_test.cc \
unit_test/basictypes_test.cc \
unit_test/color_test.cc \
unit_test/compare_test.cc \
@@ -94,6 +102,8 @@ LOCAL_SRC_FILES := \
unit_test/rotate_test.cc \
unit_test/scale_argb_test.cc \
unit_test/scale_test.cc \
+ unit_test/scale_uv_test.cc \
+ unit_test/unit_test.cc \
unit_test/video_common_test.cc
LOCAL_MODULE := libyuv_unittest
diff --git a/chromium/third_party/libyuv/BUILD.gn b/chromium/third_party/libyuv/BUILD.gn
index 9518a8db3e1..3d5298d7041 100644
--- a/chromium/third_party/libyuv/BUILD.gn
+++ b/chromium/third_party/libyuv/BUILD.gn
@@ -6,9 +6,9 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("libyuv.gni")
import("//build/config/features.gni")
import("//testing/test.gni")
+import("libyuv.gni")
declare_args() {
# Set to false to disable building with gflags.
@@ -33,13 +33,12 @@ config("libyuv_config") {
# This target is built when no specific target is specified on the command line.
group("default") {
testonly = true
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (libyuv_include_tests) {
deps += [
":compare",
":cpuid",
+ ":i444tonv12_eg",
":libyuv_unittest",
":psnr",
":yuvconvert",
@@ -53,13 +52,9 @@ group("libyuv") {
if (is_win && target_cpu == "x64" && !use_qt) {
# Compile with clang in order to get inline assembly
- public_deps = [
- ":libyuv_internal(//build/toolchain/win:win_clang_x64)",
- ]
+ public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ]
} else {
- public_deps = [
- ":libyuv_internal",
- ]
+ public_deps = [ ":libyuv_internal" ]
}
if (libyuv_use_neon) {
@@ -70,7 +65,11 @@ group("libyuv") {
deps += [ ":libyuv_msa" ]
}
- if (!is_ios) {
+ if (libyuv_use_mmi) {
+ deps += [ ":libyuv_mmi" ]
+ }
+
+ if (!is_ios && !libyuv_disable_jpeg) {
# Make sure that clients of libyuv link with libjpeg. This can't go in
# libyuv_internal because in Windows x64 builds that will generate a clang
# build of libjpeg, and we don't want two copies.
@@ -100,6 +99,7 @@ static_library("libyuv_internal") {
"include/libyuv/scale.h",
"include/libyuv/scale_argb.h",
"include/libyuv/scale_row.h",
+ "include/libyuv/scale_uv.h",
"include/libyuv/version.h",
"include/libyuv/video_common.h",
@@ -134,6 +134,7 @@ static_library("libyuv_internal") {
"source/scale_argb.cc",
"source/scale_common.cc",
"source/scale_gcc.cc",
+ "source/scale_uv.cc",
"source/scale_win.cc",
"source/video_common.cc",
]
@@ -147,7 +148,7 @@ static_library("libyuv_internal") {
configs += [ "//build/config/gcc:symbol_visibility_default" ]
}
- if (!is_ios) {
+ if (!is_ios && !libyuv_disable_jpeg) {
defines += [ "HAVE_JPEG" ]
# Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
@@ -173,6 +174,9 @@ static_library("libyuv_internal") {
"-ffp-contract=fast", # Enable fma vectorization for NEON.
]
}
+ if (!libyuv_use_mmi) {
+ defines += [ "LIBYUV_DISABLE_MMI" ]
+ }
}
if (libyuv_use_neon) {
@@ -189,9 +193,7 @@ if (libyuv_use_neon) {
"source/scale_neon64.cc",
]
- deps = [
- ":libyuv_internal",
- ]
+ deps = [ ":libyuv_internal" ]
public_configs = [ ":libyuv_config" ]
@@ -222,10 +224,24 @@ if (libyuv_use_msa) {
"source/scale_msa.cc",
]
- deps = [
- ":libyuv_internal",
+ deps = [ ":libyuv_internal" ]
+
+ public_configs = [ ":libyuv_config" ]
+ }
+}
+
+if (libyuv_use_mmi) {
+ static_library("libyuv_mmi") {
+ sources = [
+ # MMI Source Files
+ "source/compare_mmi.cc",
+ "source/rotate_mmi.cc",
+ "source/row_mmi.cc",
+ "source/scale_mmi.cc",
]
+ deps = [ ":libyuv_internal" ]
+
public_configs = [ ":libyuv_config" ]
}
}
@@ -254,8 +270,6 @@ if (libyuv_include_tests) {
testonly = true
sources = [
- # sources
- # headers
"unit_test/basictypes_test.cc",
"unit_test/color_test.cc",
"unit_test/compare_test.cc",
@@ -268,6 +282,7 @@ if (libyuv_include_tests) {
"unit_test/rotate_test.cc",
"unit_test/scale_argb_test.cc",
"unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
"unit_test/unit_test.cc",
"unit_test/unit_test.h",
"unit_test/video_common_test.cc",
@@ -286,12 +301,10 @@ if (libyuv_include_tests) {
configs += [ ":libyuv_unittest_warnings_config" ]
- public_deps = [
- "//testing/gtest",
- ]
+ public_deps = [ "//testing/gtest" ]
public_configs = [ ":libyuv_unittest_config" ]
- if (is_linux) {
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
if (is_ios) {
@@ -328,10 +341,8 @@ if (libyuv_include_tests) {
# sources
"util/compare.cc",
]
- deps = [
- ":libyuv",
- ]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -341,10 +352,8 @@ if (libyuv_include_tests) {
# sources
"util/yuvconvert.cc",
]
- deps = [
- ":libyuv",
- ]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -356,22 +365,28 @@ if (libyuv_include_tests) {
"util/psnr_main.cc",
"util/ssim.cc",
]
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (!is_ios && !libyuv_disable_jpeg) {
defines = [ "HAVE_JPEG" ]
}
}
- executable("cpuid") {
+ executable("i444tonv12_eg") {
sources = [
# sources
- "util/cpuid.c",
+ "util/i444tonv12_eg.cc",
]
deps = [
":libyuv",
]
}
+
+ executable("cpuid") {
+ sources = [
+ # sources
+ "util/cpuid.c",
+ ]
+ deps = [ ":libyuv" ]
+ }
}
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index 60e437ef6bd..de185434500 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -1,45 +1,81 @@
vars = {
'chromium_git': 'https://chromium.googlesource.com',
- 'swarming_revision': '88229872dd17e71658fe96763feaa77915d8cbd6',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling lss
- # and whatever else without interference from each other.
- 'lss_revision': 'e6527b0cd469e3ff5764785dadcb39bf7d787154',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling catapult
- # and whatever else without interference from each other.
- 'catapult_revision': 'f3ce003c2baaf3b2aba669681f832139efe5d773',
+ 'chromium_revision': '64c8c30faaf969c15c028131dfcd0819208039c1',
+ 'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94',
}
deps = {
'src/build':
- Var('chromium_git') + '/chromium/src/build' + '@' + '8cb53523220fec0dee401d2ee5f046cbf43b0656',
+ Var('chromium_git') + '/chromium/src/build' + '@' + '2d2f9f2b85592bb9af5753ef300c055e6feb709f',
'src/buildtools':
- Var('chromium_git') + '/chromium/buildtools.git' + '@' + '5941c1b3df96c1db756a2834343533335c394c4a',
+ Var('chromium_git') + '/chromium/src/buildtools' + '@' + '6302c1175607a436e18947a5abe9df2209e845fc',
'src/testing':
- Var('chromium_git') + '/chromium/src/testing' + '@' + '60b2c69b17816251c0d20557eb818d26ac7e0fe4',
+ Var('chromium_git') + '/chromium/src/testing' + '@' + '40b44171056045ed1f85ca0b57485e46c03d7867',
'src/third_party':
- Var('chromium_git') + '/chromium/src/third_party' + '@' + 'e755204b7ae59ba1c63e5720a0420d8661672642',
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + '24ccdf9b7553446791983bf357261c5e0a4314a0',
+
+ 'src/buildtools/linux64': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/linux-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'checkout_linux',
+ },
+ 'src/buildtools/mac': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/mac-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'checkout_mac',
+ },
+ 'src/buildtools/win': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/windows-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'checkout_win',
+ },
+
+ 'src/buildtools/clang_format/script':
+ Var('chromium_git') + '/chromium/llvm-project/cfe/tools/clang-format.git' + '@' + '96636aa0e9f047f17447f2d45a094d0b59ed7917',
+ 'src/buildtools/third_party/libc++/trunk':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + 'd9040c75cfea5928c804ab7c235fed06a63f743a',
+ 'src/buildtools/third_party/libc++abi/trunk':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '196ba1aaa8ac285d94f4ea8d9836390a45360533',
+ 'src/buildtools/third_party/libunwind/trunk':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'd999d54f4bca789543a2eb6c995af2d9b5a1f3ed',
+
'src/third_party/catapult':
- Var('chromium_git') + '/catapult.git' + '@' + Var('catapult_revision'),
+ Var('chromium_git') + '/catapult.git' + '@' + 'ccc9dd2835f5a7c5c82ae3c1a2fbc2fe2fd9dfd1',
'src/third_party/colorama/src':
Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
+ 'src/third_party/depot_tools':
+ Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '91bb7506bd20ed22b8787e7a8b9975cc07e97175',
'src/third_party/freetype/src':
- Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'a44e20879cefea41663bb36ff4af908cc4146fb8',
+ Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '26e2a89598d69c7aba76c83f6a1fcf1db17574ab',
'src/third_party/googletest/src':
- Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'ba96d0b1161f540656efdaed035b3c062b60e006',
+ Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '4fe018038f87675c083d0cfb6a6b57c274fb1753',
'src/third_party/harfbuzz-ng/src':
- Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '957e7756634a4fdf1654041e20e883cf964ecac9',
+ Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'c39ab82c90479341dcf28eaa8174af6f08c0d7ae',
'src/third_party/libjpeg_turbo':
- Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'a1750dbc79a8792dde3d3f7d7d8ac28ba01ac9dd',
+ Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'd5148db386ceb4a608058320071cbed890bd6ad2',
+ 'src/third_party/nasm':
+ Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '19f3fad68da99277b2882939d3b2fa4c4b8d51d9',
'src/third_party/yasm/source/patched-yasm':
- Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + 'b98114e18d8b9b84586b10d24353ab8616d4c5fc',
+ Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
'src/tools':
- Var('chromium_git') + '/chromium/src/tools' + '@' + '55c65d8fecf04f55f5ba9e14b1fdba170f0202d0',
- 'src/tools/gyp':
- Var('chromium_git') + '/external/gyp.git' + '@' + 'd61a9397e668fa9843c4aa7da9e79460fe590bfb',
- 'src/tools/swarming_client':
- Var('chromium_git') + '/infra/luci/client-py.git' + '@' + Var('swarming_revision'),
+ Var('chromium_git') + '/chromium/src/tools' + '@' + '1bb7c085e67a0fc8c63511af83299d1632f5a3f3',
+ 'src/tools/swarming_client':
+ Var('chromium_git') + '/infra/luci/client-py.git' + '@' + 'd46ea7635f2911208268170512cb611412488fd8',
# libyuv-only dependencies (not present in Chromium).
'src/third_party/gflags':
@@ -50,7 +86,7 @@ deps = {
Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
'src/third_party/lss': {
- 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '29f7c7e018f4ce706a709f0b0afbf8bacf869480',
'condition': 'checkout_android or checkout_linux',
},
@@ -59,25 +95,27 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/accessibility-test-framework',
- 'version': 'version:2.1-cr0',
+ 'version': 'b5ec1e56e58e56bc1a0c77d43111c37f9b512c8a',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/auto/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + '8a81a858ae7b78a1aef71ac3905fade0bbd64e82',
+ 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + 'f40317ae215863102cf87fe0679ad66f4b19454e',
'condition': 'checkout_android',
},
+ 'src/third_party/boringssl/src':
+ 'https://boringssl.googlesource.com/boringssl.git' + '@' + '1607f54fed72c6589d560254626909a64124f091',
'src/base': {
- 'url': Var('chromium_git') + '/chromium/src/base' + '@' + '733a32608c5cd39c03a578cf6001afc2e6c636a2',
+ 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e096814b0448fba1095c6e7be7c7a0b5d7264251',
'condition': 'checkout_android',
},
'src/third_party/bazel': {
'packages': [
{
'package': 'chromium/third_party/bazel',
- 'version': 'version:0.10.0',
+ 'version': 'VjMsf48QUWw8n7XtJP2AuSjIGmbQeYdWdwyxVvIRLmAC',
},
],
'condition': 'checkout_android',
@@ -87,42 +125,96 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/bouncycastle',
- 'version': 'version:1.46-cr0',
+ 'version': 'c078e87552ba26e776566fdaf0f22cd8712743d0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/android_ndk': {
- 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '5cd86312e794bdf542a3685c6f10cbb96072990b',
+ 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '27c0a8d090c666a50e40fceb4ee5b40b1a2d3f87',
'condition': 'checkout_android',
},
'src/third_party/android_support_test_runner': {
'packages': [
{
'package': 'chromium/third_party/android_support_test_runner',
- 'version': 'version:0.5-cr0',
+ 'version': '96d4bf848cd210fdcbca6bcc8c1b4b39cbd93141',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_tools': {
- 'url': Var('chromium_git') + '/android_tools.git' + '@' + 'c22a664c39af72dd8f89200220713dcad811300a',
- 'condition': 'checkout_android',
+ 'src/third_party/android_sdk/public': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_sdk/public/build-tools/30.0.1',
+ 'version': '8LZujEmLjSh0g3JciDA3cslSptxKs9HOa_iUPXkOeYQC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/cmdline-tools',
+ 'version': 'ijpIFSitwBfaEdO9VXBGPqDHUVzPimXy_whw3aHTN9oC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/emulator',
+ 'version': 'A4EvXZUIuQho0QRDJopMUpgyp6NA3aiDQjGKPUKbowMC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/extras',
+ 'version': 'ppQ4TnqDvBHQ3lXx5KPq97egzF5X2FFyOrVHkGmiTMQC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/patcher',
+ 'version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platform-tools',
+ 'version': '8tF0AOj7Dwlv4j7_nfkhxWB0jzrvWWYjEIpirt8FIWYC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platforms/android-30',
+ 'version': 'YMUu9EHNZ__2Xcxl-KsaSf-dI5TMt_P62IseUVsxktMC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/sources/android-29',
+ 'version': '4gxhM8E62bvZpQs7Q3d0DinQaW0RLCIefhXrQBFkNy8C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/tools/clang/dsymutil': {
+ 'packages': [
+ {
+ 'package': 'chromium/llvm-build-tools/dsymutil',
+ 'version': 'OWlhXkmj18li3yhJk59Kmjbc5KdgLh56TwCd1qBdzlIC',
+ }
+ ],
+ 'condition': 'checkout_mac',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_build_tools/aapt2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_build_tools/aapt2',
+ 'version': 'version:3.6.0-alpha03-5516695-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
},
'src/third_party/byte_buddy': {
'packages': [
{
'package': 'chromium/third_party/byte_buddy',
- 'version': 'version:1.4.17-cr0',
+ 'version': 'c9b53316603fc2d997c899c7ca1707f809b918cd',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/ced/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '94c367a1fe3a13207f4b22604fcfd1d9f9ddf6d9',
+ 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5',
'condition': 'checkout_android',
},
'src/third_party/errorprone/lib': {
@@ -137,7 +229,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/gson',
- 'version': 'version:2.8.0-cr0',
+ 'version': '681931c9778045903a0ed59856ce2dd8dd7bf7ca',
},
],
'condition': 'checkout_android',
@@ -147,7 +239,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/guava',
- 'version': 'version:23.0-cr0',
+ 'version': 'a6fba501f3a0de88b9be1daa2052632de5b96a46',
},
],
'condition': 'checkout_android',
@@ -157,20 +249,21 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/hamcrest',
- 'version': 'version:1.3-cr0',
+ 'version': '37eccfc658fe79695d6abb6dd497463c4372032f',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
+
'src/third_party/icu': {
- 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'd888fd2a1be890f4d35e43f68d6d79f42519a357',
+ 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'c2a4cae149aae7fd30c4cbe3cf1b30df03b386f1',
},
'src/third_party/icu4j': {
'packages': [
{
'package': 'chromium/third_party/icu4j',
- 'version': 'version:53.1-cr0',
+ 'version': 'e87e5bed2b4935913ee26a3ebd0b723ee2344354',
},
],
'condition': 'checkout_android',
@@ -180,7 +273,21 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/intellij',
- 'version': 'version:12.0-cr0',
+ 'version': '77c2721b024b36ee073402c08e6d8428c0295336',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/jdk': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/jdk',
+ 'version': 'PfRSnxe8Od6WU4zBXomq-zsgcJgWmm3z4gMQNB-r2QcC',
+ },
+ {
+ 'package': 'chromium/third_party/jdk/extras',
+ 'version': 'fkhuOQ3r-zKtWEdKplpo6k0vKkjl-LY_rJTmtzFCQN4C',
},
],
'condition': 'checkout_android',
@@ -194,15 +301,19 @@ deps = {
'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
'condition': 'checkout_android',
},
+ 'src/third_party/libunwindstack': {
+ 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '11659d420a71e7323b379ea8781f07c6f384bc7e',
+ 'condition': 'checkout_android',
+ },
'src/third_party/mockito/src': {
- 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + 'de83ad4598ad4cf5ea53c69a8a8053780b04b850',
+ 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac',
'condition': 'checkout_android',
},
'src/third_party/objenesis': {
'packages': [
{
'package': 'chromium/third_party/objenesis',
- 'version': 'version:2.4-cr0',
+ 'version': '9e367f55e5a65781ee77bfcbaa88fb82b30e75c0',
},
],
'condition': 'checkout_android',
@@ -212,7 +323,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/ow2_asm',
- 'version': 'version:5.0.1-cr0',
+ 'version': 'NNAhdJzMdnutUVqfSJm5v0tVazA9l3Dd6CRwH6N4Q5kC',
},
],
'condition': 'checkout_android',
@@ -222,40 +333,64 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/r8',
- 'version': 'version:1.0.30',
+ 'version': 'N9LppKV-9lFkp7JQtmcLHhm7xHqFv0SPa6aDPtgNCdwC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/proguard': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/proguard',
+ 'version': '3bd778c422ea5496de2ef25c007a517dbb5ce5ca',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/requests/src': {
- 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
+ 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'refs/tags/v2.23.0',
'condition': 'checkout_android',
},
'src/third_party/robolectric': {
'packages': [
{
'package': 'chromium/third_party/robolectric',
- 'version': 'version:3.5.1',
+ 'version': '1KXoOiNP1a_uZNdM2ybWKwAQNow1dHTXTig-ZK4Xgq8C',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/robolectric/robolectric': {
- 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '7e067f1112e1502caa742f7be72d37b5678d3403',
+ 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '2f3e0a3ac450a17dbf2e7d4eaab3a1f14dda50e6',
'condition': 'checkout_android',
},
'src/third_party/sqlite4java': {
'packages': [
{
'package': 'chromium/third_party/sqlite4java',
- 'version': 'version:0.282-cr0',
+ 'version': '889660698187baa7c8b0d79f7bf58563125fbd66',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/turbine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/turbine',
+ 'version': 'O_jNDJ4VdwYKBSDbd2BJ3mknaTFoVkvE7Po8XIiKy8sC',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
+ 'src/third_party/turbine/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/turbine.git' + '@' + '0f2a5024fe4a9bb745bcd5ac7c655cebe11649bc',
+ 'condition': 'checkout_android',
+ },
'src/third_party/ub-uiautomator/lib': {
'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
'condition': 'checkout_android',
@@ -264,7 +399,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/xstream',
- 'version': 'version:1.4.8-cr0',
+ 'version': '4278b1b78b86ab7a1a29e64d5aec9a47a9aab0fe',
},
],
'condition': 'checkout_android',
@@ -273,7 +408,7 @@ deps = {
# iOS deps:
'src/ios': {
- 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '299ef76e844a74a1f2f4ce7f06d101861fb49aba',
+ 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '60ef55beac67e3c0eda1c35ab7944c786b377313',
'condition': 'checkout_ios'
},
@@ -285,11 +420,176 @@ deps = {
},
# === ANDROID_DEPS Generated Code Start ===
- # Generated by //tools/android/roll/android_deps/fetch_all.sh
- 'src/third_party/android_deps/repository/android_arch_core_common': {
+ # Generated by //third_party/android_deps/fetch_all.py
+ 'src/third_party/android_deps/libs/android_arch_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_core_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/android_arch_core_common',
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_activity_activity': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_activity_activity',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_annotation_annotation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_annotation_annotation_experimental': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation_experimental',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_appcompat_appcompat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_appcompat_appcompat',
+ 'version': 'version:1.2.0-beta01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_appcompat_appcompat_resources': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_appcompat_appcompat_resources',
+ 'version': 'version:1.2.0-beta01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_arch_core_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_arch_core_core_common',
+ 'version': 'version:2.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_arch_core_core_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_arch_core_core_runtime',
+ 'version': 'version:2.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_asynclayoutinflater_asynclayoutinflater': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_asynclayoutinflater_asynclayoutinflater',
'version': 'version:1.0.0-cr0',
},
],
@@ -297,10 +597,10 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/android_arch_lifecycle_common': {
+ 'src/third_party/android_deps/libs/androidx_cardview_cardview': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/android_arch_lifecycle_common',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_cardview_cardview',
'version': 'version:1.0.0-cr0',
},
],
@@ -308,10 +608,21 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/android_arch_lifecycle_runtime': {
+ 'src/third_party/android_deps/libs/androidx_collection_collection': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/android_arch_lifecycle_runtime',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_collection_collection',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_concurrent_concurrent_futures': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_concurrent_concurrent_futures',
'version': 'version:1.0.0-cr0',
},
],
@@ -319,87 +630,98 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_animated_vector_drawable': {
+ 'src/third_party/android_deps/libs/androidx_coordinatorlayout_coordinatorlayout': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_animated_vector_drawable',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_coordinatorlayout_coordinatorlayout',
+ 'version': 'version:1.1.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_appcompat_v7': {
+ 'src/third_party/android_deps/libs/androidx_core_core': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_appcompat_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_core_core',
+ 'version': 'version:1.3.0-beta01-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_cardview_v7': {
+ 'src/third_party/android_deps/libs/androidx_cursoradapter_cursoradapter': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_cardview_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_cursoradapter_cursoradapter',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_design': {
+ 'src/third_party/android_deps/libs/androidx_customview_customview': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_design',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_customview_customview',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_gridlayout_v7': {
+ 'src/third_party/android_deps/libs/androidx_documentfile_documentfile': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_gridlayout_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_documentfile_documentfile',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_leanback_v17': {
+ 'src/third_party/android_deps/libs/androidx_drawerlayout_drawerlayout': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_leanback_v17',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_drawerlayout_drawerlayout',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_mediarouter_v7': {
+ 'src/third_party/android_deps/libs/androidx_exifinterface_exifinterface': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_mediarouter_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_exifinterface_exifinterface',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_multidex': {
+ 'src/third_party/android_deps/libs/androidx_fragment_fragment': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_multidex',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_fragment_fragment',
+ 'version': 'version:1.2.5-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_gridlayout_gridlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_gridlayout_gridlayout',
'version': 'version:1.0.0-cr0',
},
],
@@ -407,165 +729,2211 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_palette_v7': {
+ 'src/third_party/android_deps/libs/androidx_interpolator_interpolator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_interpolator_interpolator',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_leanback_leanback': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_leanback_leanback',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_leanback_leanback_preference': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_leanback_leanback_preference',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_preference_v14': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_preference_v14',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_core_ui': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_core_ui',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_core_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_core_utils',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_v13': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_v13',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_v4': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_v4',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common_java8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common_java8',
+ 'version': 'version:2.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata',
+ 'version': 'version:2.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata_core',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_runtime',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel_savedstate': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel_savedstate',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_loader_loader': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_loader_loader',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_localbroadcastmanager_localbroadcastmanager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_localbroadcastmanager_localbroadcastmanager',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_media_media': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_media_media',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_mediarouter_mediarouter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_mediarouter_mediarouter',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_multidex_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_multidex_multidex',
+ 'version': 'version:2.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_palette_palette': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_palette_palette',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_preference_preference': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_preference_preference',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_print_print': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_print_print',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_recyclerview_recyclerview': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_recyclerview_recyclerview',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_savedstate_savedstate': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_savedstate_savedstate',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_slice_slice_builders': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_slice_slice_builders',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_slice_slice_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_slice_slice_core',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_slidingpanelayout_slidingpanelayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_slidingpanelayout_slidingpanelayout',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_swiperefreshlayout_swiperefreshlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_swiperefreshlayout_swiperefreshlayout',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_core',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_contrib': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_contrib',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_core',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_idling_resource': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_idling_resource',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_intents': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_intents',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_web': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_web',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_ext_junit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_ext_junit',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_monitor': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_monitor',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_rules': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_rules',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_runner': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_runner',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_uiautomator_uiautomator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_uiautomator_uiautomator',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_transition_transition': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_transition_transition',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_tvprovider_tvprovider': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_tvprovider_tvprovider',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable_animated': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable_animated',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_versionedparcelable_versionedparcelable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_versionedparcelable_versionedparcelable',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_viewpager2_viewpager2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_viewpager2_viewpager2',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_viewpager_viewpager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_viewpager_viewpager',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_webkit_webkit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_webkit_webkit',
+ 'version': 'version:1.3.0-rc01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_window_window': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_window_window',
+ 'version': 'version:1.0.0-alpha01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent',
+ 'version': 'version:3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/classworlds_classworlds': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/classworlds_classworlds',
+ 'version': 'version:1.1-alpha-2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_collections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cursoradapter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_customview': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_design': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_documentfile': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_drawerlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_gridlayout_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_gridlayout_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_interpolator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_leanback_v17': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_leanback_v17',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_loader': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_mediarouter_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_mediarouter_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_palette_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_palette_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_leanback_v17': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_leanback_v17',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_v14': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v14',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_print': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_print',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_ui': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_fragment': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_media_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v13': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v13',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v4': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_transition': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_viewpager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_core',
+ 'version': 'version:1.0.0-beta08-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_processor': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_processor',
+ 'version': 'version:1.0.0-beta08-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs',
+ 'version': 'version:1.0.10-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration',
+ 'version': 'version:1.0.10-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine',
+ 'version': 'version:2.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms',
+ 'version': 'version:1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
+ 'version': 'version:17.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido',
+ 'version': 'version:18.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
+ 'version': 'version:18.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
+ 'version': 'version:18.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_material_material': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material',
+ 'version': 'version:1.2.0-alpha06-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_auto_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common',
+ 'version': 'version:0.10-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service',
+ 'version': 'version:1.0-rc6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations',
+ 'version': 'version:1.0-rc6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations',
+ 'version': 'version:1.7-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_findbugs_jFormatString': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jformatstring',
+ 'version': 'version:3.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305',
+ 'version': 'version:3.0.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_gson_gson': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson',
+ 'version': 'version:2.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac',
+ 'version': 'version:9+181-r4173-1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded',
+ 'version': 'version:9-dev-r4023-3-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format',
+ 'version': 'version:1.5-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_failureaccess': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess',
+ 'version': 'version:1.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_guava': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_palette_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava',
+ 'version': 'version:27.1-jre-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_preference_leanback_v17': {
+ 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_preference_leanback_v17',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture',
+ 'version': 'version:1.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_preference_v14': {
+ 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_preference_v14',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations',
+ 'version': 'version:1.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_preference_v7': {
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_preference_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java',
+ 'version': 'version:3.4.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_recyclerview_v7': {
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_recyclerview_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite',
+ 'version': 'version:3.13.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_annotations': {
+ 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_annotations',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils',
+ 'version': 'version:1.3.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_compat': {
+ 'src/third_party/android_deps/libs/com_squareup_javapoet': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_compat',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet',
+ 'version': 'version:1.11.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_core_ui': {
+ 'src/third_party/android_deps/libs/com_squareup_javawriter': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_core_ui',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter',
+ 'version': 'version:2.1.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_core_utils': {
+ 'src/third_party/android_deps/libs/commons_cli_commons_cli': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_core_utils',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/commons_cli_commons_cli',
+ 'version': 'version:1.3.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_fragment': {
+ 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_fragment',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api',
+ 'version': 'version:1.3.2-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_media_compat': {
+ 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_media_compat',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api',
+ 'version': 'version:1.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_v13': {
+ 'src/third_party/android_deps/libs/javax_inject_javax_inject': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_v13',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject',
+ 'version': 'version:1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_v4': {
+ 'src/third_party/android_deps/libs/nekohtml_nekohtml': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_v4',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/nekohtml_nekohtml',
+ 'version': 'version:1.9.6.2-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_vector_drawable': {
+ 'src/third_party/android_deps/libs/nekohtml_xercesMinimal': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_vector_drawable',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/nekohtml_xercesminimal',
+ 'version': 'version:1.9.6.2-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_transition': {
+ 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_transition',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap',
+ 'version': 'version:0.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_sf_kxml_kxml2',
+ 'version': 'version:2.3.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_ant_ant': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant',
+ 'version': 'version:1.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_ant_ant_launcher': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant_launcher',
+ 'version': 'version:1.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks',
+ 'version': 'version:2.1.3-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_model': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_model',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_profile': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_profile',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_project': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_project',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_settings': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_settings',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup',
+ 'version': 'version:1.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual',
+ 'version': 'version:2.5.3-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual',
+ 'version': 'version:2.10.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_dataflow_shaded': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_shaded',
+ 'version': 'version:3.1.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations',
+ 'version': 'version:1.17-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default',
+ 'version': 'version:1.0-alpha-9-stable-1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation',
+ 'version': 'version:1.11-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils',
+ 'version': 'version:1.5.15-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jdom_jdom2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jdom_jdom2',
+ 'version': 'version:2.0.6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_annotations',
+ 'version': 'version:13.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib',
+ 'version': 'version:1.3.50-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common',
+ 'version': 'version:1.3.50-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm',
+ 'version': 'version:0.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_pcollections_pcollections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections',
+ 'version': 'version:2.1.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_junit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_pluginapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_resources': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_robolectric': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_sandbox': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadowapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_multidex',
+ 'version': 'version:4.3.1-cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_threeten_threeten_extra': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_threeten_threeten_extra',
+ 'version': 'version:1.5.0-cr0',
},
],
'condition': 'checkout_android',
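
Every entry in the long hunk above instantiates the same CIPD dependency template. As a reading aid, here is a minimal annotated sketch of that pattern in DEPS-style Python; the group/artifact names and the version are placeholders, not a real package from this patch:

    deps = {
        'src/third_party/android_deps/libs/example_group_example_artifact': {
            'packages': [
                {
                    # CIPD package path mirroring the Maven coordinates.
                    'package': 'chromium/third_party/android_deps/libs/example_group_example_artifact',
                    # Upstream version plus a Chromium packaging suffix.
                    'version': 'version:1.0.0-cr0',
                },
            ],
            # Fetched only for checkouts with an Android target OS.
            'condition': 'checkout_android',
            # Resolved via the CIPD client instead of git.
            'dep_type': 'cipd',
        },
    }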
@@ -651,24 +3019,39 @@ hooks = [
'name': 'mac_toolchain',
'pattern': '.',
'action': ['python', 'src/build/mac_toolchain.py'],
+ 'condition': 'checkout_mac',
},
- # Pull binutils for linux, enabled debug fission for faster linking /
- # debugging when used with clang on Ubuntu Precise.
- # https://code.google.com/p/chromium/issues/detail?id=352046
+ # Pull the msan libraries on linux.
{
- 'name': 'binutils',
- 'pattern': 'src/third_party/binutils',
- 'action': [
- 'python',
- 'src/third_party/binutils/download.py',
- ],
+ 'name': 'msan_chained_origins',
+ 'pattern': '.',
+ 'condition': 'checkout_linux',
+ 'action': [ 'python',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-trusty.tgz.sha1',
+ ],
+ },
+ {
+ 'name': 'msan_no_origins',
+ 'pattern': '.',
+ 'condition': 'checkout_linux',
+ 'action': [ 'python',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-trusty.tgz.sha1',
+ ],
},
{
# Pull clang if needed or requested via GYP_DEFINES.
# Note: On Win, this should run after win_toolchain, as it may use it.
'name': 'clang',
'pattern': '.',
- 'action': ['python', 'src/tools/clang/scripts/update.py', '--if-needed'],
+ 'action': ['python', 'src/tools/clang/scripts/update.py'],
},
{
# Update LASTCHANGE.
@@ -677,40 +3060,6 @@ hooks = [
'action': ['python', 'src/build/util/lastchange.py',
'-o', 'src/build/util/LASTCHANGE'],
},
- # Pull GN binaries.
- {
- 'name': 'gn_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/win/gn.exe.sha1',
- ],
- },
- {
- 'name': 'gn_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/mac/gn.sha1',
- ],
- },
- {
- 'name': 'gn_linux64',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/linux64/gn.sha1',
- ],
- },
# Pull clang-format binaries using checked-in hashes.
{
'name': 'clang_format_win',
@@ -737,6 +3086,7 @@ hooks = [
{
'name': 'clang_format_linux',
'pattern': '.',
+ 'condition': 'host_os == "linux"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=linux*',
@@ -791,26 +3141,6 @@ hooks = [
'--root', 'src',
],
},
- # Android dependencies. Many are downloaded using Google Storage these days.
- # They're copied from https://cs.chromium.org/chromium/src/DEPS for all
- # such dependencies we share with Chromium.
- {
- # This downloads SDK extras and puts them in the
- # third_party/android_tools/sdk/extras directory.
- 'name': 'sdkextras',
- 'pattern': '.',
- # When adding a new sdk extras package to download, add the package
- # directory and zip file to .gitignore in third_party/android_tools.
- 'action': ['python',
- 'src/build/android/play_services/update.py',
- 'download'
- ],
- },
]
-recursedeps = [
- # buildtools provides clang_format, libc++, and libc++abi.
- 'src/buildtools',
- # android_tools manages the NDK.
- 'src/third_party/android_tools',
-]
+recursedeps = []
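
The two msan hooks added above follow the usual gated download pattern: run a storage fetch only when the matching platform was checked out. A minimal sketch of that hook shape, with a placeholder name, bucket, and .sha1 path (illustrative only, not part of this patch):

    hooks = [
        {
            'name': 'example_instrumented_library',
            'pattern': '.',
            # Skipped entirely unless a Linux checkout was requested.
            'condition': 'checkout_linux',
            'action': ['python',
                       'src/third_party/depot_tools/download_from_google_storage.py',
                       '--no_resume',
                       '--no_auth',
                       '--bucket', 'example-bucket',
                       '-s', 'src/third_party/example/archive.tgz.sha1'],
        },
    ]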
diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS
index 7b21adfe6c7..755c220be4d 100644
--- a/chromium/third_party/libyuv/OWNERS
+++ b/chromium/third_party/libyuv/OWNERS
@@ -1,8 +1,12 @@
+mbonadei@chromium.org
fbarchard@chromium.org
magjed@chromium.org
+pbos@chromium.org
-per-file *.gn=phoglund@chromium.org
+per-file *.gn=mbonadei@chromium.org
per-file .gitignore=*
per-file AUTHORS=*
per-file DEPS=*
-per-file PRESUBMIT.py=phoglund@chromium.org
+per-file PRESUBMIT.py=mbonadei@chromium.org
+
+# COMPONENT: Internals>Images>Codecs
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index 4ecdcb2840b..4a7e30b087c 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1714
+Version: 1768
License: BSD
License File: LICENSE
diff --git a/chromium/third_party/libyuv/README.md b/chromium/third_party/libyuv/README.md
index 7b6619220b8..db70b7f08d3 100644
--- a/chromium/third_party/libyuv/README.md
+++ b/chromium/third_party/libyuv/README.md
@@ -10,9 +10,9 @@
### Development
-See [Getting started] [1] for instructions on how to get started developing.
+See [Getting started][1] for instructions on how to get started developing.
-You can also browse the [docs directory] [2] for more documentation.
+You can also browse the [docs directory][2] for more documentation.
-[1]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/getting_started.md
-[2]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/
+[1]: ./docs/getting_started.md
+[2]: ./docs/
diff --git a/chromium/third_party/libyuv/build_overrides/build.gni b/chromium/third_party/libyuv/build_overrides/build.gni
index 6d8319b965e..a83860a8eb8 100644
--- a/chromium/third_party/libyuv/build_overrides/build.gni
+++ b/chromium/third_party/libyuv/build_overrides/build.gni
@@ -44,3 +44,13 @@ if (host_os == "mac") {
"hermetic toolchain if the minimum OS version is not met.")
use_system_xcode = _result == 0
}
+
+declare_args() {
+ # Tracing support requires //third_party/perfetto.
+ enable_base_tracing = false
+ use_perfetto_client_library = false
+
+ # Allows googletest to pretty-print various absl types.
+ # Defined here rather than in gtest.gni to match chromium.
+ gtest_enable_absl_printers = true
+}
diff --git a/chromium/third_party/libyuv/docs/environment_variables.md b/chromium/third_party/libyuv/docs/environment_variables.md
index c28d83e7dc1..cd8159ad5a8 100644
--- a/chromium/third_party/libyuv/docs/environment_variables.md
+++ b/chromium/third_party/libyuv/docs/environment_variables.md
@@ -6,7 +6,10 @@ For test purposes, environment variables can be set to control libyuv behavior.
By default the cpu is detected and the most advanced form of SIMD is used. But you can disable instruction sets selectively, or completely, falling back on C code. Set the variable to 1 to disable the specified instruction set.
+## All CPUs
LIBYUV_DISABLE_ASM
+
+## Intel CPUs
LIBYUV_DISABLE_X86
LIBYUV_DISABLE_SSE2
LIBYUV_DISABLE_SSSE3
@@ -14,12 +17,25 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
LIBYUV_DISABLE_SSE42
LIBYUV_DISABLE_AVX
LIBYUV_DISABLE_AVX2
- LIBYUV_DISABLE_AVX512BW
LIBYUV_DISABLE_ERMS
LIBYUV_DISABLE_FMA3
- LIBYUV_DISABLE_MSA
+ LIBYUV_DISABLE_F16C
+ LIBYUV_DISABLE_AVX512BW
+ LIBYUV_DISABLE_AVX512VL
+ LIBYUV_DISABLE_AVX512VBMI
+ LIBYUV_DISABLE_AVX512VBMI2
+ LIBYUV_DISABLE_AVX512VBITALG
+ LIBYUV_DISABLE_AVX512VPOPCNTDQ
+ LIBYUV_DISABLE_GFNI
+
+## ARM CPUs
+
LIBYUV_DISABLE_NEON
+## MIPS CPUs
+ LIBYUV_DISABLE_MSA
+ LIBYUV_DISABLE_MMI
+
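Each of the variables above disables one instruction-set path when set to 1, with everything falling back to C code if all SIMD is off. A sketch of driving a test run that way from Python; the unittest binary path is an assumption about the local build layout, not something this patch prescribes:

    import os
    import subprocess

    # Disable AVX2 so the run exercises the next-best SIMD path (or C).
    env = dict(os.environ, LIBYUV_DISABLE_AVX2='1')
    subprocess.run(['out/Release/libyuv_unittest'], env=env, check=True)
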
# Test Width/Height/Repeat
The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test specific resolutions.
diff --git a/chromium/third_party/libyuv/docs/formats.md b/chromium/third_party/libyuv/docs/formats.md
index 97e8ce05f48..a29ed5c3043 100644
--- a/chromium/third_party/libyuv/docs/formats.md
+++ b/chromium/third_party/libyuv/docs/formats.md
@@ -36,7 +36,7 @@ This is how OSX formats map to libyuv
The following is extracted from video_common.h as a complete list of formats supported by libyuv.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -46,9 +46,11 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020, unofficial fourcc.
+ // 10 bit lsb
// 1 Secondary YUV format: row biplanar.
- FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'), // deprecated.
// 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -66,7 +68,7 @@ The following is extracted from video_common.h as a complete list of formats sup
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ // 11 Auxiliary YUV variations: 3 with U and V planes swapped, 1 Alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
@@ -75,6 +77,9 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -161,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
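
As a hedged sketch of that NULL-Y note, the new AYUVToNV21 from libyuv/convert.h (declared later in this patch) could extract just the interleaved VU plane; whether this particular function is among the "most" that accept a NULL destination Y pointer is an assumption here:

    #include <cstdint>
    #include <vector>
    #include "libyuv/convert.h"

    void ExtractVU(const uint8_t* src_ayuv, int width, int height,
                   std::vector<uint8_t>* vu) {
      int half_w = (width + 1) / 2;
      int half_h = (height + 1) / 2;
      vu->resize(half_w * 2 * half_h);
      // dst_y == NULL skips the luma plane, per the note above.
      libyuv::AYUVToNV21(src_ayuv, width * 4,
                         /*dst_y=*/nullptr, /*dst_stride_y=*/0,
                         vu->data(), half_w * 2, width, height);
    }
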
diff --git a/chromium/third_party/libyuv/docs/getting_started.md b/chromium/third_party/libyuv/docs/getting_started.md
index f547c419d67..3e339712e19 100644
--- a/chromium/third_party/libyuv/docs/getting_started.md
+++ b/chromium/third_party/libyuv/docs/getting_started.md
@@ -27,7 +27,7 @@ Then you'll get a .gclient file like:
},
];
-For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `gclient sync`.
Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master
@@ -48,11 +48,8 @@ For Android add `;target_os=['android'];` to your Linux .gclient
Then run:
- export GYP_DEFINES="OS=android"
gclient sync
-The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
-
To get just the source (not buildable):
git clone https://chromium.googlesource.com/libyuv/libyuv
@@ -135,8 +132,8 @@ ia32
mips
- gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=true"
- gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=true"
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true"
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true"
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
@@ -152,15 +149,15 @@ arm disassembly:
Running tests:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=*
Running test as benchmark:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1"
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Running test with C code:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1
### Build targets
@@ -178,13 +175,22 @@ Running test with C code:
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
+### MIPS Linux
+
+mips
+
+ gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
## Building the Library with make
### Linux
make V=1 -f linux.mk
make V=1 -f linux.mk clean
- make V=1 -f linux.mk CXX=clang++
+ make V=1 -f linux.mk CXX=clang++ CC=clang
## Building the library with cmake
diff --git a/chromium/third_party/libyuv/docs/rotation.md b/chromium/third_party/libyuv/docs/rotation.md
index fb84fce5a9c..a08430fded0 100644
--- a/chromium/third_party/libyuv/docs/rotation.md
+++ b/chromium/third_party/libyuv/docs/rotation.md
@@ -100,4 +100,8 @@ Inverting can be achieved with almost any libyuv function by passing a negative
I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+# Cropping - Vertical Flip
+When cropping from a subsampled format like NV21, the method of setting the start pointers won't work for an odd crop start y on the UV plane.
+If the height after cropping is odd, invert the source instead: point to the last row, negate the strides, and pass a negative height, which
+will re-invert the image as the conversion outputs.
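
A sketch of that trick, assuming an NV12 source and the NV12ToI420 entry point from libyuv/convert.h; the pointer math and buffer names are illustrative only:

    #include <cstdint>
    #include "libyuv/convert.h"

    // Crop starting at an odd row by inverting the source: point at the
    // last cropped row, negate the strides, and pass a negative height
    // so the conversion re-inverts the output to upright.
    int CropNV12OddY(const uint8_t* src_y, int src_stride_y,
                     const uint8_t* src_uv, int src_stride_uv,
                     int crop_y, int crop_height, int width,
                     uint8_t* dst_y, int dst_stride_y,
                     uint8_t* dst_u, int dst_stride_u,
                     uint8_t* dst_v, int dst_stride_v) {
      const uint8_t* y = src_y + (crop_y + crop_height - 1) * src_stride_y;
      const uint8_t* uv =
          src_uv + ((crop_y + crop_height - 1) / 2) * src_stride_uv;
      return libyuv::NV12ToI420(y, -src_stride_y, uv, -src_stride_uv,
                                dst_y, dst_stride_y, dst_u, dst_stride_u,
                                dst_v, dst_stride_v, width, -crop_height);
    }
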
diff --git a/chromium/third_party/libyuv/include/libyuv.h b/chromium/third_party/libyuv/include/libyuv.h
index aeffd5ef7a4..a06e1233abb 100644
--- a/chromium/third_party/libyuv/include/libyuv.h
+++ b/chromium/third_party/libyuv/include/libyuv.h
@@ -26,6 +26,7 @@
#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"
#include "libyuv/version.h"
#include "libyuv/video_common.h"
diff --git a/chromium/third_party/libyuv/include/libyuv/compare_row.h b/chromium/third_party/libyuv/include/libyuv/compare_row.h
index e81f7455eee..e95b9d93eb2 100644
--- a/chromium/third_party/libyuv/include/libyuv/compare_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/compare_row.h
@@ -84,6 +84,11 @@ extern "C" {
#define HAS_SUMSQUAREERROR_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_HAMMINGDISTANCE_MMI
+#define HAS_SUMSQUAREERROR_MMI
+#endif
+
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -102,7 +107,9 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t HammingDistance_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
-
+uint32_t HammingDistance_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
uint32_t SumSquareError_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -118,6 +125,9 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
uint32_t SumSquareError_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
+uint32_t SumSquareError_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
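
These row functions are internal, but the portable C variant is callable directly for a quick check; a hedged sketch (that HammingDistance_C counts differing bits over count bytes is inferred from its use in the compare code):

    #include <cstdint>
    #include "libyuv/compare_row.h"

    uint32_t BitDiff(const uint8_t* a, const uint8_t* b, int count) {
      // Portable fallback; the _NEON/_MSA/_MMI variants share this
      // signature and are picked by the higher-level dispatch.
      return libyuv::HammingDistance_C(a, b, count);
    }
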
diff --git a/chromium/third_party/libyuv/include/libyuv/convert.h b/chromium/third_party/libyuv/include/libyuv/convert.h
index d12ef24f799..026b153cefe 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert.h
@@ -42,6 +42,36 @@ int I444ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Convert I422 to I420.
LIBYUV_API
int I422ToI420(const uint8_t* src_y,
@@ -59,6 +89,21 @@ int I422ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I422 to NV21.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Copy I420 to I420.
#define I420ToI420 I420Copy
LIBYUV_API
@@ -127,6 +172,17 @@ int I400ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
@@ -185,16 +241,25 @@ int UYVYToI420(const uint8_t* src_uyvy,
int width,
int height);
-// Convert M420 to I420.
+// Convert AYUV to NV12.
LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
int width,
int height);
@@ -281,6 +346,19 @@ int RGB24ToI420(const uint8_t* src_rgb24,
int width,
int height);
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
// RGB big endian (rgb in memory) to I420.
LIBYUV_API
int RAWToI420(const uint8_t* src_raw,
@@ -333,7 +411,24 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
int width,
int height);
-#ifdef HAVE_JPEG
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
LIBYUV_API
@@ -350,13 +445,38 @@ int MJPGToI420(const uint8_t* sample,
int dst_width,
int dst_height);
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8_t* sample,
size_t sample_size,
int* width,
int* height);
-#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_argb.h
index ab772b6c323..715a3dad97d 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_argb.h
@@ -15,16 +15,41 @@
#include "libyuv/rotate.h" // For enum RotationMode.
-// TODO(fbarchard): This set of functions should exactly match convert.h
-// TODO(fbarchard): Add tests. Create random content of right size and convert
-// with C vs Opt and or to I420 and compare.
-// TODO(fbarchard): Some of these functions lack parameter setting.
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// Conversion matrix for YUV to RGB
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+
+// Conversion matrix for YVU to BGR
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+
+// Macros for end swapped destination Matrix conversions.
+// Swap UV and pass mirrored kYvuJPEGConstants matrix.
+// TODO(fbarchard): Add macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+
// Alias.
#define ARGBToARGB ARGBCopy
@@ -50,7 +75,7 @@ int I420ToARGB(const uint8_t* src_y,
int width,
int height);
-// Duplicate prototype for function in convert_from.h for remoting.
+// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
@@ -63,19 +88,292 @@ int I420ToABGR(const uint8_t* src_y,
int width,
int height);
-// Convert I010 to ARGB.
+// Convert J420 to ARGB.
LIBYUV_API
-int I010ToARGB(const uint16_t* src_y,
+int J420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint16_t* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint16_t* src_v,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
// Convert I010 to ARGB.
LIBYUV_API
int I010ToARGB(const uint16_t* src_y,
@@ -128,52 +426,104 @@ int H010ToABGR(const uint16_t* src_y,
int width,
int height);
-// Convert I422 to ARGB.
+// Convert U010 to ARGB.
LIBYUV_API
-int I422ToARGB(const uint8_t* src_y,
+int U010ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
-// Convert I444 to ARGB.
+// Convert U010 to ABGR.
LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
+int U010ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
-// Convert J444 to ARGB.
+// Convert I210 to ABGR.
LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
+int I210ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
-// Convert I444 to ABGR.
+// Convert H210 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
+int H210ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_abgr,
int dst_stride_abgr,
@@ -256,6 +606,7 @@ int NV21ToARGB(const uint8_t* src_y,
int height);
// Convert NV12 to ABGR.
+LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
@@ -298,14 +649,38 @@ int NV21ToRGB24(const uint8_t* src_y,
int width,
int height);
-// Convert M420 to ARGB.
+// Convert NV21 to YUV24.
LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height);
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
// Convert YUY2 to ARGB.
LIBYUV_API
@@ -325,126 +700,113 @@ int UYVYToARGB(const uint8_t* src_uyvy,
int width,
int height);
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J422 to ARGB.
+// Convert I010 to AR30.
LIBYUV_API
-int J422ToARGB(const uint8_t* src_y,
+int I010ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert J420 to ABGR.
+// Convert I010 to AB30.
LIBYUV_API
-int J420ToABGR(const uint8_t* src_y,
+int I010ToAB30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert J422 to ABGR.
+// Convert H010 to AR30.
LIBYUV_API
-int J422ToABGR(const uint8_t* src_y,
+int H010ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H420 to ARGB.
+// Convert H010 to AB30.
LIBYUV_API
-int H420ToARGB(const uint8_t* src_y,
+int H010ToAB30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert H422 to ARGB.
+// Convert U010 to AR30.
LIBYUV_API
-int H422ToARGB(const uint8_t* src_y,
+int U010ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H420 to ABGR.
+// Convert U010 to AB30.
LIBYUV_API
-int H420ToABGR(const uint8_t* src_y,
+int U010ToAB30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert H422 to ABGR.
+// Convert I210 to AR30.
LIBYUV_API
-int H422ToABGR(const uint8_t* src_y,
+int I210ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H010 to ARGB.
+// Convert I210 to AB30.
LIBYUV_API
-int H010ToARGB(const uint16_t* src_y,
+int I210ToAB30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert I010 to AR30.
+// Convert H210 to AR30.
LIBYUV_API
-int I010ToAR30(const uint16_t* src_y,
+int H210ToAR30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
@@ -455,35 +817,35 @@ int I010ToAR30(const uint16_t* src_y,
int width,
int height);
-// Convert H010 to AR30.
+// Convert H210 to AB30.
LIBYUV_API
-int H010ToAR30(const uint16_t* src_y,
+int H210ToAB30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert I010 to AB30.
+// Convert U210 to AR30.
LIBYUV_API
-int I010ToAB30(const uint16_t* src_y,
+int U210ToAR30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H010 to AB30.
+// Convert U210 to AB30.
LIBYUV_API
-int H010ToAB30(const uint16_t* src_y,
+int U210ToAB30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
@@ -542,6 +904,15 @@ int RAWToARGB(const uint8_t* src_raw,
int width,
int height);
+// RGB big endian (rgb in memory) to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
// RGB16 (RGBP fourcc) little endian to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8_t* src_rgb565,
@@ -601,7 +972,6 @@ int AR30ToAB30(const uint8_t* src_ar30,
int width,
int height);
-#ifdef HAVE_JPEG
// src_width/height provided by capture
// dst_width/height for clipping determine final size.
LIBYUV_API
@@ -613,7 +983,6 @@ int MJPGToARGB(const uint8_t* sample,
int src_height,
int dst_width,
int dst_height);
-#endif
// Convert Android420 to ARGB.
LIBYUV_API
@@ -643,6 +1012,561 @@ int Android420ToABGR(const uint8_t* src_y,
int width,
int height);
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with a 4x4 dither matrix (16 bytes).
+// Values in the dither matrix from 0 to 7 are recommended.
+// The dither matrix is ordered with the first byte as the upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I400 (grey) to ARGB with matrix. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "sample_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from.h b/chromium/third_party/libyuv/include/libyuv/convert_from.h
index 5cd8a4bfc04..5140ed4f3e9 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_from.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from.h
@@ -23,6 +23,7 @@ extern "C" {
// Convert 8 bit YUV to 10 bit.
#define H420ToH010 I420ToI010
+LIBYUV_API
int I420ToI010(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
@@ -131,6 +132,10 @@ int I420ToUYVY(const uint8_t* src_y,
int width,
int height);
+// The following are from convert_argb.h
+// DEPRECATED: These prototypes will be removed in the future. Use convert_argb.h instead.
+
+// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
@@ -143,18 +148,7 @@ int I420ToARGB(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
-
+// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
@@ -167,157 +161,6 @@ int I420ToABGR(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height);
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
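
Migration sketch for the deprecation note above: include libyuv/convert_argb.h directly instead of relying on the duplicate prototypes kept here (packed strides, hypothetical names):

    #include <cstdint>
    #include "libyuv/convert_argb.h"

    void I420PackedToARGB(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, int width, int height,
                          uint8_t* argb) {
      int half = (width + 1) / 2;
      libyuv::I420ToARGB(y, width, u, half, v, half,
                         argb, width * 4, width, height);
    }
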
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
index 05c815a093e..d992363cebb 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
int width,
int height);
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,
@@ -210,6 +214,15 @@ int ARGBToJ400(const uint8_t* src_argb,
int width,
int height);
+// Convert RGBA to J400. (JPeg full range).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8_t* src_argb,
@@ -250,10 +263,21 @@ int ARGBToNV21(const uint8_t* src_argb,
int width,
int height);
-// Convert ARGB To NV21.
+// Convert ABGR To NV12.
LIBYUV_API
-int ARGBToNV21(const uint8_t* src_argb,
- int src_stride_argb,
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert ABGR To NV21.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
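
An illustrative call of the new ABGRToNV12 with tightly packed buffers (names and strides are hypothetical):

    #include <cstdint>
    #include "libyuv/convert_from_argb.h"

    void AbgrToNV12(const uint8_t* abgr, int width, int height,
                    uint8_t* dst_y, uint8_t* dst_uv) {
      libyuv::ABGRToNV12(abgr, width * 4, dst_y, width,
                         dst_uv, ((width + 1) / 2) * 2, width, height);
    }
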
diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
index 0229cb5e736..3e27cc107dc 100644
--- a/chromium/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
@@ -48,6 +48,7 @@ static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x200000;
static const int kCpuHasMSA = 0x400000;
+static const int kCpuHasMMI = 0x800000;
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
@@ -70,6 +71,8 @@ static __inline int TestCpuFlag(int test_flag) {
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
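
A sketch of that testing flow with the new MMI flag, following the header's own MaskCpuFlags example:

    #include "libyuv/cpu_id.h"

    void DisableMmiForTests() {
      if (libyuv::TestCpuFlag(libyuv::kCpuHasMMI)) {
        // Per the comment above: mask the flag so later calls take
        // the remaining (C or other SIMD) paths.
        libyuv::MaskCpuFlags(~libyuv::kCpuHasMMI);
      }
    }
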
diff --git a/chromium/third_party/libyuv/include/libyuv/macros_msa.h b/chromium/third_party/libyuv/include/libyuv/macros_msa.h
index 29997ce11fd..4e232b66bfe 100644
--- a/chromium/third_party/libyuv/include/libyuv/macros_msa.h
+++ b/chromium/third_party/libyuv/include/libyuv/macros_msa.h
@@ -140,6 +140,9 @@
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)
+
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
index 91137baba25..8d868b95425 100644
--- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
@@ -105,6 +105,28 @@ void MergeUVPlane(const uint8_t* src_u,
int width,
int height);
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Split interleaved RGB plane into separate R, G and B planes.
LIBYUV_API
void SplitRGBPlane(const uint8_t* src_rgb,
@@ -178,6 +200,16 @@ int I444Copy(const uint8_t* src_y,
int width,
int height);
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+ int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_uv, int dst_stride_uv, int width, int height);
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+ int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_vu, int dst_stride_vu, int width, int height);
+
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -224,6 +256,19 @@ int UYVYToNV12(const uint8_t* src_uyvy,
int width,
int height);
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
LIBYUV_API
int YUY2ToY(const uint8_t* src_yuy2,
int src_stride_yuy2,
@@ -280,6 +325,22 @@ int I400Mirror(const uint8_t* src_y,
int height);
// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
@@ -291,56 +352,35 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
+// RGB24 mirror.
LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
-// Convert I422 to ABGR.
+// Mirror a plane of data.
LIBYUV_API
-int I422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
-// Convert I422 to RGBA.
+// Mirror a plane of UV data.
LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
// Alias
#define RGB24ToRAW RAWToRGB24
@@ -721,6 +761,19 @@ int ARGBBlur(const uint8_t* src_argb,
int height,
int radius);
+// Gaussian 5x5 blur of a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height);
+
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8_t* src_argb,
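
For example, the new NV21ToNV12 declared above swaps the chroma order in one call; a sketch with tightly packed buffers (hypothetical names):

    #include <cstdint>
    #include "libyuv/planar_functions.h"

    void Nv21ToNv12Packed(const uint8_t* src_y, const uint8_t* src_vu,
                          int width, int height,
                          uint8_t* dst_y, uint8_t* dst_uv) {
      int uv_stride = ((width + 1) / 2) * 2;
      libyuv::NV21ToNV12(src_y, width, src_vu, uv_stride,
                         dst_y, width, dst_uv, uv_stride, width, height);
    }
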
diff --git a/chromium/third_party/libyuv/include/libyuv/rotate.h b/chromium/third_party/libyuv/include/libyuv/rotate.h
index 76b692be8b0..308882242cb 100644
--- a/chromium/third_party/libyuv/include/libyuv/rotate.h
+++ b/chromium/third_party/libyuv/include/libyuv/rotate.h
@@ -49,6 +49,24 @@ int I420Rotate(const uint8_t* src_y,
int height,
enum RotationMode mode);
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
// Rotate NV12 input and store in I420.
LIBYUV_API
int NV12ToI420Rotate(const uint8_t* src_y,
@@ -100,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
int width,
int height);
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
LIBYUV_API
void RotateUV90(const uint8_t* src,
int src_stride,
@@ -110,10 +132,6 @@ void RotateUV90(const uint8_t* src,
int width,
int height);
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8_t* src,
int src_stride,
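
A sketch of the new I444Rotate: a 90-degree rotation swaps width and height, so the destination strides below use the source height (kRotate90 is the existing RotationMode value; buffer names are hypothetical):

    #include <cstdint>
    #include "libyuv/rotate.h"

    void RotateI444By90(const uint8_t* y, const uint8_t* u,
                        const uint8_t* v, int width, int height,
                        uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v) {
      libyuv::I444Rotate(y, width, u, width, v, width,
                         dst_y, height, dst_u, height, dst_v, height,
                         width, height, libyuv::kRotate90);
    }
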
diff --git a/chromium/third_party/libyuv/include/libyuv/rotate_row.h b/chromium/third_party/libyuv/include/libyuv/rotate_row.h
index 5edc0fcf13a..022293eef2c 100644
--- a/chromium/third_party/libyuv/include/libyuv/rotate_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/rotate_row.h
@@ -60,6 +60,11 @@ extern "C" {
#define HAS_TRANSPOSEUVWX16_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_TRANSPOSEWX8_MMI
+#define HAS_TRANSPOSEUVWX8_MMI
+#endif
+
void TransposeWxH_C(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -87,6 +92,11 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -108,6 +118,11 @@ void TransposeWx8_Any_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -156,6 +171,13 @@ void TransposeUVWx8_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
void TransposeUVWx16_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
@@ -178,6 +200,13 @@ void TransposeUVWx8_Any_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
void TransposeUVWx16_Any_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h
index 1468f4b9925..a27788c1f69 100644
--- a/chromium/third_party/libyuv/include/libyuv/row.h
+++ b/chromium/third_party/libyuv/include/libyuv/row.h
@@ -98,7 +98,6 @@ extern "C" {
#define HAS_COPYROW_SSE2
#define HAS_H422TOARGBROW_SSSE3
#define HAS_HALFFLOATROW_SSE2
-#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
@@ -112,7 +111,7 @@ extern "C" {
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORUVROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
@@ -123,6 +122,8 @@ extern "C" {
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB24TOYJROW_SSSE3
+#define HAS_RAWTOYJROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
@@ -194,11 +195,12 @@ extern "C" {
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
+#define HAS_RGB24TOYJROW_AVX2
+#define HAS_RAWTOYJROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
#define HAS_HALFFLOATROW_AVX2
// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
-#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TOARGBROW_AVX2
@@ -269,12 +271,19 @@ extern "C" {
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
-// I210 is for H010. 2 = 422. I for 601 vs H for 709.
+#define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
+#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
+#define HAS_SWAPUVROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
@@ -283,18 +292,26 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ABGRTOUVROW_AVX2
+#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2
+#define HAS_RGBATOYJROW_AVX2
+#define HAS_SWAPUVROW_AVX2
+// TODO(fbarchard): Fix AVX2 version of YUV24
+// #define HAS_NV21TOYUV24ROW_AVX2
#endif
// The following are available for AVX512 clang x86 platforms:
@@ -330,11 +347,15 @@ extern "C" {
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOUVROW_NEON
+#define HAS_AYUVTOVUROW_NEON
+#define HAS_AYUVTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
@@ -350,26 +371,33 @@ extern "C" {
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
+#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
#define HAS_NV12TORGB565ROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_NV21TORGB24ROW_NEON
+#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
+#define HAS_SWAPUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
@@ -386,6 +414,7 @@ extern "C" {
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24MIRRORROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSEPIAROW_NEON
@@ -403,6 +432,9 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_GAUSSCOL_F32_NEON
+
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_ABGRTOUVROW_MSA
@@ -447,11 +479,14 @@ extern "C" {
#define HAS_I422TOUYVYROW_MSA
#define HAS_I422TOYUY2ROW_MSA
#define HAS_I444TOARGBROW_MSA
+#define HAS_I422TOARGB1555ROW_MSA
+#define HAS_I422TORGB565ROW_MSA
#define HAS_INTERPOLATEROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_MERGEUVROW_MSA
#define HAS_MIRRORROW_MSA
#define HAS_MIRRORUVROW_MSA
+#define HAS_MIRRORSPLITUVROW_MSA
#define HAS_NV12TOARGBROW_MSA
#define HAS_NV12TORGB565ROW_MSA
#define HAS_NV21TOARGBROW_MSA
@@ -483,6 +518,98 @@ extern "C" {
#define HAS_YUY2TOYROW_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_ABGRTOUVROW_MMI
+#define HAS_ABGRTOYROW_MMI
+#define HAS_ARGB1555TOARGBROW_MMI
+#define HAS_ARGB1555TOUVROW_MMI
+#define HAS_ARGB1555TOYROW_MMI
+#define HAS_ARGB4444TOARGBROW_MMI
+#define HAS_ARGB4444TOUVROW_MMI
+#define HAS_ARGB4444TOYROW_MMI
+#define HAS_ARGBADDROW_MMI
+#define HAS_ARGBATTENUATEROW_MMI
+#define HAS_ARGBBLENDROW_MMI
+#define HAS_ARGBCOLORMATRIXROW_MMI
+#define HAS_ARGBCOPYALPHAROW_MMI
+#define HAS_ARGBCOPYYTOALPHAROW_MMI
+#define HAS_ARGBEXTRACTALPHAROW_MMI
+#define HAS_ARGBGRAYROW_MMI
+#define HAS_ARGBMIRRORROW_MMI
+#define HAS_ARGBMULTIPLYROW_MMI
+#define HAS_ARGBSEPIAROW_MMI
+#define HAS_ARGBSETROW_MMI
+#define HAS_ARGBSHADEROW_MMI
+#define HAS_ARGBSHUFFLEROW_MMI
+#define HAS_ARGBSUBTRACTROW_MMI
+#define HAS_ARGBTOARGB1555ROW_MMI
+#define HAS_ARGBTOARGB4444ROW_MMI
+#define HAS_ARGBTORAWROW_MMI
+#define HAS_ARGBTORGB24ROW_MMI
+#define HAS_ARGBTORGB565DITHERROW_MMI
+#define HAS_ARGBTORGB565ROW_MMI
+#define HAS_ARGBTOUV444ROW_MMI
+#define HAS_ARGBTOUVJROW_MMI
+#define HAS_ARGBTOUVROW_MMI
+#define HAS_ARGBTOYJROW_MMI
+#define HAS_ARGBTOYROW_MMI
+#define HAS_BGRATOUVROW_MMI
+#define HAS_BGRATOYROW_MMI
+#define HAS_BLENDPLANEROW_MMI
+#define HAS_COMPUTECUMULATIVESUMROW_MMI
+#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI
+#define HAS_HALFFLOATROW_MMI
+#define HAS_I400TOARGBROW_MMI
+#define HAS_I422TOUYVYROW_MMI
+#define HAS_I422TOYUY2ROW_MMI
+#define HAS_I422TOARGBROW_MMI
+#define HAS_I444TOARGBROW_MMI
+#define HAS_INTERPOLATEROW_MMI
+#define HAS_J400TOARGBROW_MMI
+#define HAS_MERGERGBROW_MMI
+#define HAS_MERGEUVROW_MMI
+#define HAS_MIRRORROW_MMI
+#define HAS_MIRRORSPLITUVROW_MMI
+#define HAS_RAWTOARGBROW_MMI
+#define HAS_RAWTORGB24ROW_MMI
+#define HAS_RAWTOUVROW_MMI
+#define HAS_RAWTOYROW_MMI
+#define HAS_RGB24TOARGBROW_MMI
+#define HAS_RGB24TOUVROW_MMI
+#define HAS_RGB24TOYROW_MMI
+#define HAS_RGB565TOARGBROW_MMI
+#define HAS_RGB565TOUVROW_MMI
+#define HAS_RGB565TOYROW_MMI
+#define HAS_RGBATOUVROW_MMI
+#define HAS_RGBATOYROW_MMI
+#define HAS_SOBELROW_MMI
+#define HAS_SOBELTOPLANEROW_MMI
+#define HAS_SOBELXROW_MMI
+#define HAS_SOBELXYROW_MMI
+#define HAS_SOBELYROW_MMI
+#define HAS_SPLITRGBROW_MMI
+#define HAS_SPLITUVROW_MMI
+#define HAS_UYVYTOUVROW_MMI
+#define HAS_UYVYTOYROW_MMI
+#define HAS_YUY2TOUV422ROW_MMI
+#define HAS_YUY2TOUVROW_MMI
+#define HAS_YUY2TOYROW_MMI
+#define HAS_I210TOARGBROW_MMI
+#define HAS_I422TOARGB4444ROW_MMI
+#define HAS_I422TOARGB1555ROW_MMI
+#define HAS_I422TORGB565ROW_MMI
+#define HAS_NV21TORGB24ROW_MMI
+#define HAS_NV12TORGB24ROW_MMI
+#define HAS_I422ALPHATOARGBROW_MMI
+#define HAS_I422TORGB24ROW_MMI
+#define HAS_NV12TOARGBROW_MMI
+#define HAS_NV21TOARGBROW_MMI
+#define HAS_NV12TORGB565ROW_MMI
+#define HAS_YUY2TOARGBROW_MMI
+#define HAS_UYVYTOARGBROW_MMI
+#define HAS_I422TORGBAROW_MMI
+#endif
+
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
#if defined(VISUALC_HAS_AVX2)
#define SIMD_ALIGNED(var) __declspec(align(32)) var
@@ -491,6 +618,7 @@ extern "C" {
#endif
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
typedef __declspec(align(16)) int8_t vec8[16];
typedef __declspec(align(16)) uint16_t uvec16[8];
typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -510,6 +638,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#endif
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
typedef int8_t __attribute__((vector_size(16))) vec8;
typedef uint16_t __attribute__((vector_size(16))) uvec16;
typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -524,6 +653,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#define SIMD_ALIGNED(var) var
typedef int16_t vec16[8];
typedef int32_t vec32[4];
+typedef float vecf32[4];
typedef int8_t vec8[16];
typedef uint16_t uvec16[8];
typedef uint32_t uvec32[4];
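
The new vecf32 typedef gives the float Gauss kernels a four-float, 16-byte vector type under all three compiler branches (MSVC declspec alignment, GCC/clang vector extensions, and the plain scalar fallback). A sketch of the intended use; the constant's name is illustrative, not from the library:

    #include "libyuv/row.h"

    // A four-float coefficient block for the float Gauss kernels. Under
    // MSVC and GCC/clang the typedef itself carries 16-byte alignment;
    // the last branch is the scalar fallback, where alignment is moot.
    static const vecf32 kGaussCoefF32 = {1.0f, 4.0f, 6.0f, 4.0f};
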
@@ -564,6 +694,7 @@ struct YuvConstants {
int16_t kUVBiasG[16];
int16_t kUVBiasR[16];
int16_t kYToRgb[16];
+ int16_t kYBiasToRgb[16];
};
// Offsets into YuvConstants structure
@@ -574,17 +705,9 @@ struct YuvConstants {
#define KUVBIASG 128
#define KUVBIASR 160
#define KYTORGB 192
-#endif
-
-// Conversion matrix for YUV to RGB
-extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709
+#define KYBIASTORGB 224
-// Conversion matrix for YVU to BGR
-extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
+#endif
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
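
Every YuvConstants member is an array of 16 int16_t, i.e. 32 bytes, so the byte offsets consumed by the assembly step by 32 and the new bias row lands at 192 + 32 = 224, which is what KYBIASTORGB records. A compile-time sketch of that arithmetic for the struct layout shown above (valid on the branch where these macros are defined), plus a worked example of IS_ALIGNED:

    #include <stddef.h>
    #include "libyuv/row.h"

    // Each constants row is 16 * sizeof(int16_t) = 32 bytes.
    _Static_assert(offsetof(struct YuvConstants, kYToRgb) == 192,
                   "KYTORGB matches the struct layout");
    _Static_assert(offsetof(struct YuvConstants, kYBiasToRgb) == 224,
                   "KYBIASTORGB = 192 + 32");

    // IS_ALIGNED assumes a power-of-two alignment: a - 1 is a low-bit
    // mask, so the test checks that the low log2(a) address bits are
    // zero. For example, 0x1008 & (16 - 1) == 8, so an address of 0x1008
    // is not 16-byte aligned.
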
@@ -740,6 +863,10 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
@@ -754,6 +881,12 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
@@ -767,6 +900,12 @@ void I422ToRGBARow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -824,19 +963,31 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -850,7 +1001,16 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_MMI(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -900,32 +1060,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -940,11 +1100,58 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
@@ -952,37 +1159,59 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width);
-void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
-void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+
+void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1001,38 +1230,57 @@ void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
-void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1042,6 +1290,11 @@ void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
@@ -1090,6 +1343,15 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
@@ -1175,47 +1437,92 @@ void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_C(const uint8_t* src_rgb0,
+void BGRAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_C(const uint8_t* src_rgb0,
+void ABGRToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_C(const uint8_t* src_rgb0,
+void RGBAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_C(const uint8_t* src_rgb0,
+void RGB24ToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_C(const uint8_t* src_rgb0,
+void RAWToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1254,34 +1561,50 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
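
This hunk re-points the MirrorUVRow name at a plain interleaved mirror (src_uv to dst_uv) and renames the old mirror-and-split behaviour to MirrorSplitUVRow. Reference loops for the two semantics as the signatures imply (a sketch, not the library's code; width counts UV sample pairs):

    #include <stdint.h>

    // New MirrorUVRow meaning: reverse pair order, keep U before V.
    static void MirrorUVRow_ref(const uint8_t* src_uv, uint8_t* dst_uv,
                                int width) {
      for (int x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = src_uv[2 * (width - 1 - x) + 0];
        dst_uv[2 * x + 1] = src_uv[2 * (width - 1 - x) + 1];
      }
    }

    // Old behaviour, now MirrorSplitUVRow: reverse and de-interleave.
    static void MirrorSplitUVRow_ref(const uint8_t* src_uv, uint8_t* dst_u,
                                     uint8_t* dst_v, int width) {
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[2 * (width - 1 - x) + 0];
        dst_v[x] = src_uv[2 * (width - 1 - x) + 1];
      }
    }
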
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1293,6 +1616,17 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
@@ -1314,6 +1648,10 @@ void SplitUVRow_MSA(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void SplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1330,6 +1668,10 @@ void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void SplitUVRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void MergeUVRow_C(const uint8_t* src_u,
const uint8_t* src_v,
@@ -1351,6 +1693,10 @@ void MergeUVRow_MSA(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
+void MergeUVRow_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -1367,6 +1713,38 @@ void MergeUVRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void MergeUVRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
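
HalfMergeUVRow reads a row pair from separate U and V planes and emits one interleaved row; the name and signature suggest a rounded 2x2 average that halves chroma resolution in both directions, as an I444-to-NV12 step would need. A reference sketch under that inferred semantics, assuming an even source width:

    #include <stdint.h>

    // Plausible reference semantics (inferred, not copied from the
    // library): average each 2x2 block of U and of V across two source
    // rows and emit one interleaved UV pair. width is the source width
    // in samples.
    static void HalfMergeUVRow_ref(const uint8_t* src_u, int src_stride_u,
                                   const uint8_t* src_v, int src_stride_v,
                                   uint8_t* dst_uv, int width) {
      for (int x = 0; x < width; x += 2) {
        dst_uv[0] = (src_u[x] + src_u[x + 1] + src_u[src_stride_u + x] +
                     src_u[src_stride_u + x + 1] + 2) >> 2;  // rounded avg
        dst_uv[1] = (src_v[x] + src_v[x + 1] + src_v[src_stride_v + x] +
                     src_v[src_stride_v + x + 1] + 2) >> 2;
        dst_uv += 2;
      }
    }
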
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
@@ -1383,6 +1761,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -1393,6 +1776,11 @@ void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
+void SplitRGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
void MergeRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
@@ -1409,6 +1797,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
+void MergeRGBRow_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -1419,6 +1812,11 @@ void MergeRGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
+void MergeRGBRow_Any_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
@@ -1497,12 +1895,16 @@ void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
@@ -1517,6 +1919,9 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
uint8_t* dst_a,
int width);
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1529,16 +1934,23 @@ void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void SetRow_C(uint8_t* dst, uint8_t v8, int width);
void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
@@ -1554,6 +1966,8 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MMI(uint8_t* dst_ptr, uint32_t v32, int width);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8_t* src_argb,
@@ -1576,6 +1990,10 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width);
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
@@ -1592,11 +2010,16 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
int width);
+void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -1615,30 +2038,44 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
@@ -1658,6 +2095,9 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1687,24 +2127,36 @@ void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1712,6 +2164,9 @@ void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
@@ -1780,6 +2235,20 @@ void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
const uint32_t dither4,
int width);
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+
void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -1793,6 +2262,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1804,6 +2274,7 @@ void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void I444ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
@@ -1867,6 +2338,10 @@ void NV21ToRGB24Row_C(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
@@ -2033,6 +2508,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb565,
@@ -2238,6 +2717,10 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -2319,21 +2802,50 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MMI(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
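
The I400 rows now thread a YuvConstants pointer through, so grayscale expansion shares the matrix machinery (including the new kYBiasToRgb row) rather than a hard-coded ramp. A sketch of the adjusted calling convention, assuming the BT.601 table kYuvI601Constants is in scope (its extern declaration is relocated by this change):

    #include <stdint.h>
    #include "libyuv/row.h"

    // Expand one row of 8-bit luma to ARGB with the BT.601 matrix.
    static void gray_row_to_argb(const uint8_t* src_y, uint8_t* dst_argb,
                                 int width) {
      I400ToARGBRow_C(src_y, dst_argb, &kYuvI601Constants, width);
    }
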
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
@@ -2348,6 +2860,10 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
void ARGBBlendRow_C(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
@@ -2374,6 +2890,16 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void BlendPlaneRow_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
void BlendPlaneRow_C(const uint8_t* src0,
const uint8_t* src1,
const uint8_t* alpha,
@@ -2418,6 +2944,14 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB add images.
void ARGBAddRow_C(const uint8_t* src_argb0,
@@ -2456,6 +2990,14 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
@@ -2495,6 +3037,14 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -2584,6 +3134,24 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
const uint32_t param,
int width);
+void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+
void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2653,6 +3221,10 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -2672,12 +3244,24 @@ void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2770,15 +3354,25 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width);
void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_C(const uint8_t* src_yuy2,
int src_stride_yuy2,
@@ -2820,15 +3414,25 @@ void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
@@ -2870,15 +3474,25 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width);
void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_C(const uint8_t* src_uyvy,
@@ -2921,15 +3535,59 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
void I422ToYUY2Row_C(const uint8_t* src_y,
const uint8_t* src_u,
@@ -3006,21 +3664,41 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
void I422ToUYVYRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
// Effects related row functions.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
@@ -3036,6 +3714,9 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -3048,6 +3729,9 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
// Inverse table for unattenuate, shared by C and SSE2.
extern const uint32_t fixed_invtbl8[256];
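Unattenuation divides each premultiplied channel by alpha; the shared table turns that division into a multiply. A minimal sketch of the idea, assuming entries of roughly (255 << 16) / a in 16.16 fixed point (the real table's exact contents and rounding bias are not shown in this diff):

#include <stdint.h>

// Sketch only: recover a straight (non-premultiplied) channel from a
// premultiplied one via a reciprocal-of-alpha table. The assumed entry
// is t[a] ~= (255 << 16) / a; fixed_invtbl8's actual bias may differ.
static inline uint8_t UnattenuateSketch(uint8_t ch, uint8_t a,
                                        const uint32_t* t) {
  if (a == 0) return 0;                      // fully transparent pixel
  uint32_t v = ((uint32_t)ch * t[a]) >> 16;  // ch * 255 / a
  return (uint8_t)(v > 255 ? 255 : v);       // clamp if ch > a
}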
@@ -3071,11 +3755,13 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3093,6 +3779,10 @@ void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
void ARGBColorTableRow_C(uint8_t* dst_argb,
const uint8_t* table_argb,
@@ -3145,6 +3835,10 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value);
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
// Used for blur.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
@@ -3158,6 +3852,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width);
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width);
+
void CumulativeSumToAverageRow_C(const int32_t* tl,
const int32_t* bl,
int w,
@@ -3208,6 +3907,11 @@ void InterpolateRow_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
@@ -3228,6 +3932,11 @@ void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
+void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
@@ -3256,6 +3965,11 @@ void SobelXRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y2,
uint8_t* dst_sobelx,
int width);
+void SobelXRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
void SobelYRow_C(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
@@ -3272,6 +3986,10 @@ void SobelYRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
int width);
+void SobelYRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
void SobelRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
@@ -3288,6 +4006,10 @@ void SobelRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
+void SobelRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
void SobelToPlaneRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
@@ -3304,6 +4026,10 @@ void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
int width);
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
void SobelXYRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
@@ -3320,6 +4046,10 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
void SobelRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3332,6 +4062,10 @@ void SobelRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void SobelRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3344,6 +4078,10 @@ void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3356,6 +4094,10 @@ void SobelXYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void SobelXYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void ARGBPolynomialRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3462,6 +4204,178 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+void I210ToARGBRow_MMI(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_MMI(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
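These helpers suggest a separable 5-tap Gaussian: one row pass plus one column pass over five source rows. A plain-C sketch of the column pass, assuming the classic 1-4-6-4-1 kernel with normalization deferred to the row pass (both assumptions, since the implementations are not in this hunk):

// Sketch: 5-tap Gaussian column pass; the kernel weights are assumed.
static void GaussColF32Sketch(const float* src0, const float* src1,
                              const float* src2, const float* src3,
                              const float* src4, float* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src0[i] + 4.0f * src1[i] + 6.0f * src2[i] + 4.0f * src3[i] +
             src4[i];
  }
}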
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libyuv/include/libyuv/scale.h
index b937d348cab..add5a9eb622 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale.h
@@ -97,6 +97,79 @@ int I420Scale_16(const uint16_t* src_y,
int dst_height,
enum FilterMode filtering);
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
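A minimal call sketch for the new I444 path, halving a frame with bilinear filtering; the buffers and tight strides are illustrative assumptions:

#include <stdint.h>
#include "libyuv/scale.h"

// Sketch: all three I444 planes are full resolution, so each stride is
// simply the plane width when the buffers are tightly packed.
int HalveI444(const uint8_t* sy, const uint8_t* su, const uint8_t* sv,
              int w, int h, uint8_t* dy, uint8_t* du, uint8_t* dv) {
  const int dw = w / 2, dh = h / 2;
  return libyuv::I444Scale(sy, w, su, w, sv, w, w, h,
                           dy, dw, du, dw, dv, dw, dw, dh,
                           libyuv::kFilterBilinear);
}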
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
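And a matching sketch for the biplanar path; per the comment above, box filtering falls back to bilinear on the interleaved UV plane. Strides below assume tightly packed buffers and an even width:

#include <stdint.h>
#include "libyuv/scale.h"

// Sketch: the NV12 UV plane is half width but stores 2 bytes per chroma
// sample, so for even widths its byte stride equals the Y stride.
int HalveNV12(const uint8_t* sy, const uint8_t* suv, int w, int h,
              uint8_t* dy, uint8_t* duv) {
  const int dw = w / 2, dh = h / 2;
  return libyuv::NV12Scale(sy, w, suv, w, w, h,
                           dy, dw, duv, dw, dw, dh,
                           libyuv::kFilterBilinear);
}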
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h
index 7194ba09f84..a386d499895 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h
@@ -58,6 +58,7 @@ extern "C" {
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
+#define HAS_SCALEADDROW_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -69,7 +70,22 @@ extern "C" {
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
-#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -86,7 +102,9 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEADDROW_NEON
#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
@@ -94,7 +112,8 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
-#define HAS_SCALEARGBFILTERCOLS_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -110,6 +129,24 @@ extern "C" {
#define HAS_SCALEROWDOWN4_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_FIXEDDIV1_MIPS
+#define HAS_FIXEDDIV_MIPS
+#define HAS_SCALEADDROW_16_MMI
+#define HAS_SCALEADDROW_MMI
+#define HAS_SCALEARGBCOLS_MMI
+#define HAS_SCALEARGBCOLSUP2_MMI
+#define HAS_SCALEARGBROWDOWN2_MMI
+#define HAS_SCALEARGBROWDOWNEVEN_MMI
+#define HAS_SCALECOLS_16_MMI
+#define HAS_SCALECOLS_MMI
+#define HAS_SCALEROWDOWN2_16_MMI
+#define HAS_SCALEROWDOWN2_MMI
+#define HAS_SCALEROWDOWN4_16_MMI
+#define HAS_SCALEROWDOWN4_MMI
+#define HAS_SCALEROWDOWN34_MMI
+#endif
+
// Scale ARGB vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -147,12 +184,17 @@ enum FilterMode ScaleFilterReduce(int src_width,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_C(int num, int div);
int FixedDiv_X86(int num, int div);
+int FixedDiv_MIPS(int num, int div);
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div);
int FixedDiv1_X86(int num, int div);
+int FixedDiv1_MIPS(int num, int div);
#ifdef HAS_FIXEDDIV_X86
#define FixedDiv FixedDiv_X86
#define FixedDiv1 FixedDiv1_X86
+#elif defined HAS_FIXEDDIV_MIPS
+#define FixedDiv FixedDiv_MIPS
+#define FixedDiv1 FixedDiv1_MIPS
#else
#define FixedDiv FixedDiv_C
#define FixedDiv1 FixedDiv1_C
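The contract is 16.16 fixed point: the quotient keeps 16 fractional bits, so dividing 1 by 2 yields 0x8000. A hedged sketch of the portable behavior (the shipped FixedDiv1_C may apply a different rounding bias than a naive reading of its comment suggests):

#include <stdint.h>

// Sketch of the assumed C reference: (num / div) in 16.16 fixed point.
static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
// e.g. FixedDivSketch(3, 4) == 0xC000, i.e. 0.75 in 16.16.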
@@ -352,6 +394,53 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
int dst_width,
int x32,
int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int,
+ int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
@@ -569,6 +658,16 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
int dst_width,
int x,
int dx);
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
@@ -607,6 +706,18 @@ void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -643,7 +754,18 @@ void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-
+void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
@@ -674,6 +796,16 @@ void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width);
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
@@ -704,6 +836,202 @@ void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
+void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
@@ -874,6 +1202,10 @@ void ScaleRowDown34_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
+void ScaleRowDown34_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* d,
@@ -927,6 +1259,10 @@ void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowDown34_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -936,6 +1272,93 @@ void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width);
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_uv.h b/chromium/third_party/libyuv/include/libyuv/scale_uv.h
new file mode 100644
index 00000000000..1b6327aaed1
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/scale_uv.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_UV_H_
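UVScale works on a single interleaved two-byte-per-sample plane, which is how the NV12 path above scales chroma. A minimal sketch, with width/height counting UV sample pairs and byte strides assumed tightly packed:

#include <stdint.h>
#include "libyuv/scale_uv.h"

// Sketch: halve an interleaved UV plane; each sample pair is 2 bytes,
// hence the (width * 2) byte strides.
int HalveUVPlane(const uint8_t* src_uv, int w, int h, uint8_t* dst_uv) {
  return libyuv::UVScale(src_uv, w * 2, w, h,
                         dst_uv, (w / 2) * 2, w / 2, h / 2,
                         libyuv::kFilterBilinear);
}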
diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h
index 249f61f71ac..efaac73e3ab 100644
--- a/chromium/third_party/libyuv/include/libyuv/version.h
+++ b/chromium/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1714
+#define LIBYUV_VERSION 1768
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/chromium/third_party/libyuv/include/libyuv/video_common.h b/chromium/third_party/libyuv/include/libyuv/video_common.h
index ffcbdbf1b0c..b9823d71d09 100644
--- a/chromium/third_party/libyuv/include/libyuv/video_common.h
+++ b/chromium/third_party/libyuv/include/libyuv/video_common.h
@@ -50,7 +50,7 @@ extern "C" {
// Secondary formats are converted in 2 steps.
// Auxiliary formats call primary converters.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -59,9 +59,10 @@ enum FourCC {
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420
+ FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422
- // 1 Secondary YUV format: row biplanar.
+ // 1 Secondary YUV format: row biplanar. Deprecated.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
// 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
@@ -80,15 +81,29 @@ enum FourCC {
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ // 14 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
- FOURCC_J420 = FOURCC('J', '4', '2', '0'),
- FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
- FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
- FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_J420 =
+ FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J422 =
+ FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J444 =
+ FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J400 =
+ FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc
+ FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc
+ FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
+ FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420
+ FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422
+ FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -133,7 +148,7 @@ enum FourCCBpp {
FOURCC_BPP_NV12 = 12,
FOURCC_BPP_YUY2 = 16,
FOURCC_BPP_UYVY = 16,
- FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_M420 = 12, // deprecated
FOURCC_BPP_Q420 = 12,
FOURCC_BPP_ARGB = 32,
FOURCC_BPP_BGRA = 32,
diff --git a/chromium/third_party/libyuv/libyuv.gni b/chromium/third_party/libyuv/libyuv.gni
index 89e4d382327..8df40ba2d77 100644
--- a/chromium/third_party/libyuv/libyuv.gni
+++ b/chromium/third_party/libyuv/libyuv.gni
@@ -13,8 +13,11 @@ import("//build/config/mips.gni")
declare_args() {
libyuv_include_tests = !build_with_chromium
libyuv_disable_jpeg = false
- libyuv_use_neon = (current_cpu == "arm64" ||
- (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)))
- libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") &&
- mips_use_msa
+ libyuv_use_neon =
+ current_cpu == "arm64" ||
+ (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
+ libyuv_use_msa =
+ (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa
+ libyuv_use_mmi =
+ (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi
}
diff --git a/chromium/third_party/libyuv/linux.mk b/chromium/third_party/libyuv/linux.mk
index 3cb6addddd4..3e93b710d49 100644
--- a/chromium/third_party/libyuv/linux.mk
+++ b/chromium/third_party/libyuv/linux.mk
@@ -13,14 +13,15 @@ LOCAL_OBJ_FILES := \
source/compare.o \
source/compare_common.o \
source/compare_gcc.o \
+ source/compare_mmi.o \
source/compare_msa.o \
- source/compare_neon64.o \
source/compare_neon.o \
+ source/compare_neon64.o \
source/compare_win.o \
- source/convert_argb.o \
source/convert.o \
- source/convert_from_argb.o \
+ source/convert_argb.o \
source/convert_from.o \
+ source/convert_from_argb.o \
source/convert_jpeg.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
@@ -28,30 +29,34 @@ LOCAL_OBJ_FILES := \
source/mjpeg_decoder.o \
source/mjpeg_validate.o \
source/planar_functions.o \
+ source/rotate.o \
source/rotate_any.o \
source/rotate_argb.o \
- source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
+ source/rotate_mmi.o \
source/rotate_msa.o \
- source/rotate_neon64.o \
source/rotate_neon.o \
+ source/rotate_neon64.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
+ source/row_mmi.o \
source/row_msa.o \
- source/row_neon64.o \
source/row_neon.o \
+ source/row_neon64.o \
source/row_win.o \
+ source/scale.o \
source/scale_any.o \
source/scale_argb.o \
- source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
+ source/scale_mmi.o \
source/scale_msa.o \
- source/scale_neon64.o \
source/scale_neon.o \
+ source/scale_neon64.o \
+ source/scale_uv.o \
source/scale_win.o \
source/video_common.o
@@ -61,7 +66,7 @@ LOCAL_OBJ_FILES := \
.c.o:
$(CC) -c $(CFLAGS) $*.c -o $*.o
-all: libyuv.a yuvconvert cpuid psnr
+all: libyuv.a i444tonv12_eg yuvconvert cpuid psnr
libyuv.a: $(LOCAL_OBJ_FILES)
$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
@@ -74,6 +79,10 @@ yuvconvert: util/yuvconvert.cc libyuv.a
psnr: util/psnr.cc
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
+# A simple conversion example.
+i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a
+ $(CC) $(CFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a
+
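The new i444tonv12_eg target builds a small demo whose source is not part of this diff; in spirit the conversion it exercises is a single call (a sketch assuming the I444ToNV12 entry point in libyuv/convert.h):

#include <stdint.h>
#include "libyuv/convert.h"  // assumed home of I444ToNV12

// Sketch: I444 (three full-size planes) to NV12 (Y plus interleaved
// half-size UV), tightly packed buffers and even dimensions assumed.
int I444ToNV12Sketch(const uint8_t* sy, const uint8_t* su,
                     const uint8_t* sv, int w, int h,
                     uint8_t* dy, uint8_t* duv) {
  return libyuv::I444ToNV12(sy, w, su, w, sv, w, dy, w, duv, w, w, h);
}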
# A C test utility that uses libyuv conversion from C.
# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0
# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk
@@ -81,4 +90,4 @@ cpuid: util/cpuid.c libyuv.a
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
clean:
- /bin/rm -f source/*.o *.ii *.s libyuv.a yuvconvert cpuid psnr
+ /bin/rm -f source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert cpuid psnr
diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc
index 50e3abd0556..e93aba1b53e 100644
--- a/chromium/third_party/libyuv/source/compare.cc
+++ b/chromium/third_party/libyuv/source/compare.cc
@@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
- if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
- if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
@@ -149,11 +149,17 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a,
HammingDistance = HammingDistance_AVX2;
}
#endif
+#if defined(HAS_HAMMINGDISTANCE_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ HammingDistance = HammingDistance_MMI;
+ }
+#endif
#if defined(HAS_HAMMINGDISTANCE_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
HammingDistance = HammingDistance_MSA;
}
#endif
+
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
#endif
@@ -205,6 +211,11 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
SumSquareError = SumSquareError_AVX2;
}
#endif
+#if defined(HAS_SUMSQUAREERROR_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SumSquareError = SumSquareError_MMI;
+ }
+#endif
#if defined(HAS_SUMSQUAREERROR_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SumSquareError = SumSquareError_MSA;
diff --git a/chromium/third_party/libyuv/source/compare_gcc.cc b/chromium/third_party/libyuv/source/compare_gcc.cc
index 676527c1b1b..6700f9697e0 100644
--- a/chromium/third_party/libyuv/source/compare_gcc.cc
+++ b/chromium/third_party/libyuv/source/compare_gcc.cc
@@ -29,38 +29,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
uint64_t diff = 0u;
asm volatile(
- "xor %3,%3 \n"
- "xor %%r8,%%r8 \n"
- "xor %%r9,%%r9 \n"
- "xor %%r10,%%r10 \n"
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%rcx \n"
- "mov 0x8(%0),%%rdx \n"
- "xor (%1),%%rcx \n"
- "xor 0x8(%1),%%rdx \n"
- "popcnt %%rcx,%%rcx \n"
- "popcnt %%rdx,%%rdx \n"
- "mov 0x10(%0),%%rsi \n"
- "mov 0x18(%0),%%rdi \n"
- "xor 0x10(%1),%%rsi \n"
- "xor 0x18(%1),%%rdi \n"
- "popcnt %%rsi,%%rsi \n"
- "popcnt %%rdi,%%rdi \n"
- "add $0x20,%0 \n"
- "add $0x20,%1 \n"
- "add %%rcx,%3 \n"
- "add %%rdx,%%r8 \n"
- "add %%rsi,%%r9 \n"
- "add %%rdi,%%r10 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "add %%r8, %3 \n"
- "add %%r9, %3 \n"
- "add %%r10, %3 \n"
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -80,26 +80,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%ecx \n"
- "mov 0x4(%0),%%edx \n"
- "xor (%1),%%ecx \n"
- "xor 0x4(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "mov 0x8(%0),%%ecx \n"
- "mov 0xc(%0),%%edx \n"
- "xor 0x8(%1),%%ecx \n"
- "xor 0xc(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "add $0x10,%0 \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -121,46 +121,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t diff = 0u;
asm volatile(
- "movdqa %4,%%xmm2 \n"
- "movdqa %5,%%xmm3 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub %0,%1 \n"
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa 0x10(%0), %%xmm5 \n"
- "pxor (%0,%1), %%xmm4 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pand %%xmm2,%%xmm6 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm6,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "paddb %%xmm7,%%xmm6 \n"
- "pxor 0x10(%0,%1),%%xmm5 \n"
- "add $0x20,%0 \n"
- "movdqa %%xmm5,%%xmm4 \n"
- "pand %%xmm2,%%xmm5 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm5,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm5 \n"
- "pshufb %%xmm4,%%xmm5 \n"
- "paddb %%xmm7,%%xmm5 \n"
- "paddb %%xmm5,%%xmm6 \n"
- "psadbw %%xmm1,%%xmm6 \n"
- "paddd %%xmm6,%%xmm0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "pshufd $0xaa,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0, %3 \n"
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -182,40 +182,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
- "sub %0,%1 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqa (%0),%%ymm4 \n"
- "vmovdqa 0x20(%0), %%ymm5 \n"
- "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
- "vpand %%ymm2,%%ymm4,%%ymm6 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
- "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
- "add $0x40,%0 \n"
- "vpand %%ymm2,%%ymm4,%%ymm5 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
- "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
- "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
- "vpermq $0xb1,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xaa,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovd %%xmm0, %3 \n"
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -234,34 +234,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -301,44 +301,44 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
diff --git a/chromium/third_party/libyuv/source/compare_mmi.cc b/chromium/third_party/libyuv/source/compare_mmi.cc
new file mode 100644
index 00000000000..7640d9468cb
--- /dev/null
+++ b/chromium/third_party/libyuv/source/compare_mmi.cc
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// Hakmem method for Hamming distance.
+uint32_t HammingDistance_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0;
+ uint64_t c1 = 0x5555555555555555;
+ uint64_t c2 = 0x3333333333333333;
+ uint64_t c3 = 0x0f0f0f0f0f0f0f0f;
+ uint32_t c4 = 0x01010101;
+ uint64_t s1 = 1, s2 = 2, s3 = 4;
+ __asm__ volatile(
+ "1: \n\t"
+ "ldc1 %[ta], 0(%[src_a]) \n\t"
+ "ldc1 %[tb], 0(%[src_b]) \n\t"
+ "xor %[temp], %[ta], %[tb] \n\t"
+ "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1
+ "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1
+ "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1
+ "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2)
+ "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2
+ "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2
+ "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t
+ "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4
+ "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4)
+ "and %[temp1], %[temp1], %[c3] \n\t" //&c3
+ "dmfc1 $t0, %[temp1] \n\t"
+ "dsrl32 $t0, $t0, 0 \n\t "
+ "mul $t0, $t0, %[c4] \n\t"
+ "dsrl $t0, $t0, 24 \n\t"
+ "dadd %[diff], %[diff], $t0 \n\t"
+ "dmfc1 $t0, %[temp1] \n\t"
+ "mul $t0, $t0, %[c4] \n\t"
+ "dsrl $t0, $t0, 24 \n\t"
+ "dadd %[diff], %[diff], $t0 \n\t"
+ "daddiu %[src_a], %[src_a], 8 \n\t"
+ "daddiu %[src_b], %[src_b], 8 \n\t"
+ "addiu %[count], %[count], -8 \n\t"
+ "bgtz %[count], 1b \n\t"
+ "nop \n\t"
+ : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b),
+ [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp),
+ [temp1] "+f"(temp1)
+ : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1),
+ [s2] "f"(s2), [s3] "f"(s3)
+ : "memory");
+ return diff;
+}
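The constants above are the classic parallel bit count: pair sums (0x5555...), nibble sums (0x3333..., 0x0f0f...), then a multiply by 0x01010101 to accumulate the bytes within each 32-bit half. A scalar rendering of the same steps, as a readability sketch rather than the shipped C path:

#include <stdint.h>

// Sketch: Hakmem-style popcount of one 64-bit xor word, mirroring the
// c1/c2/c3/c4 steps in the MMI kernel above.
static uint32_t Popcount64Sketch(uint64_t x) {
  x = x - ((x >> 1) & 0x5555555555555555ULL);  // 2-bit partial sums
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;  // per-byte sums
  uint32_t hi = ((uint32_t)(x >> 32) * 0x01010101u) >> 24;         // high half
  uint32_t lo = ((uint32_t)(x & 0xffffffffu) * 0x01010101u) >> 24; // low half
  return hi + lo;
}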
+
+uint32_t SumSquareError_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse = 0u;
+ uint32_t sse_hi = 0u, sse_lo = 0u;
+
+ uint64_t src1, src2;
+ uint64_t diff, diff_hi, diff_lo;
+ uint64_t sse_sum, sse_tmp;
+
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t"
+
+ "1: \n\t"
+ "ldc1 %[src1], 0x00(%[src_a]) \n\t"
+ "ldc1 %[src2], 0x00(%[src_b]) \n\t"
+ "pasubub %[diff], %[src1], %[src2] \n\t"
+ "punpcklbh %[diff_lo], %[diff], %[mask] \n\t"
+ "punpckhbh %[diff_hi], %[diff], %[mask] \n\t"
+ "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t"
+ "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
+ "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t"
+ "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
+
+ "daddiu %[src_a], %[src_a], 0x08 \n\t"
+ "daddiu %[src_b], %[src_b], 0x08 \n\t"
+ "daddiu %[count], %[count], -0x08 \n\t"
+ "bnez %[count], 1b \n\t"
+
+ "mfc1 %[sse_lo], %[sse_sum] \n\t"
+ "mfhc1 %[sse_hi], %[sse_sum] \n\t"
+ "daddu %[sse], %[sse_hi], %[sse_lo] \n\t"
+ : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1),
+ [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi),
+ [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp),
+ [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo)
+ : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count),
+ [mask] "f"(mask)
+ : "memory");
+
+ return sse;
+}
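For reference, the scalar computation this kernel vectorizes eight bytes at a time:

#include <stdint.h>

// Sketch: sum of squared byte differences over count bytes.
static uint32_t SumSquareErrorSketch(const uint8_t* a, const uint8_t* b,
                                     int count) {
  uint32_t sse = 0;
  for (int i = 0; i < count; ++i) {
    const int d = (int)a[i] - (int)b[i];
    sse += (uint32_t)(d * d);
  }
  return sse;
}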
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc
index 2a2181e0cb3..afdd6012164 100644
--- a/chromium/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libyuv/source/compare_neon.cc
@@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t diff;
asm volatile(
- "vmov.u16 q4, #0 \n" // accumulator
+ "vmov.u16 q4, #0 \n" // accumulator
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n"
- "vld1.8 {q2, q3}, [%1]! \n"
- "veor.32 q0, q0, q2 \n"
- "veor.32 q1, q1, q3 \n"
- "vcnt.i8 q0, q0 \n"
- "vcnt.i8 q1, q1 \n"
- "subs %2, %2, #32 \n"
- "vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpadal.u8 q4, q0 \n" // 8 shorts
- "bgt 1b \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
- "vpaddl.u16 q0, q4 \n" // 4 ints
- "vpadd.u32 d0, d0, d1 \n"
- "vpadd.u32 d0, d0, d0 \n"
- "vmov.32 %3, d0[0] \n"
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
@@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
"1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
diff --git a/chromium/third_party/libyuv/source/compare_neon64.cc b/chromium/third_party/libyuv/source/compare_neon64.cc
index 6e8f672ab73..70fb9b9143f 100644
--- a/chromium/third_party/libyuv/source/compare_neon64.cc
+++ b/chromium/third_party/libyuv/source/compare_neon64.cc
@@ -27,22 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile(
- "movi v4.8h, #0 \n"
+ "movi v4.8h, #0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
- "eor v0.16b, v0.16b, v2.16b \n"
- "eor v1.16b, v1.16b, v3.16b \n"
- "cnt v0.16b, v0.16b \n"
- "cnt v1.16b, v1.16b \n"
- "subs %w2, %w2, #32 \n"
- "add v0.16b, v0.16b, v1.16b \n"
- "uadalp v4.8h, v0.16b \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
- "uaddlv s4, v4.8h \n"
- "fmov %w3, s4 \n"
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
@@ -54,28 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
diff --git a/chromium/third_party/libyuv/source/convert.cc b/chromium/third_party/libyuv/source/convert.cc
index 375cc732c1d..98258b9bc93 100644
--- a/chromium/third_party/libyuv/source/convert.cc
+++ b/chromium/third_party/libyuv/source/convert.cc
@@ -215,6 +215,195 @@ int I422ToI420(const uint8_t* src_y,
dst_v, dst_stride_v, width, height, src_uv_width, height);
}
+// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Allocate u and v buffers
+ align_buffer_64(plane_u, halfwidth * halfheight * 2);
+ uint8_t* plane_v = plane_u + halfwidth * halfheight;
+
+ I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+ height);
+ MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+ halfwidth, halfheight);
+ free_aligned_buffer_64(plane_u);
+ return 0;
+}
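+
+// Usage sketch (illustrative only; frame size and buffers are assumptions):
+#if 0
+static void ExampleI422ToNV21(void) {
+  enum { kW = 64, kH = 48 };  // even dimensions keep the strides simple
+  static uint8_t src_y[kW * kH], src_u[(kW / 2) * kH], src_v[(kW / 2) * kH];
+  static uint8_t dst_y[kW * kH], dst_vu[kW * (kH / 2)];  // interleaved V/U
+  I422ToNV21(src_y, kW, src_u, kW / 2, src_v, kW / 2, dst_y, kW, dst_vu, kW,
+             kW, kH);
+}
+#endif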
+
+#ifdef I422TONV21_ROW_VERSION
+// The unit test fails for this version.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// Swap src_u and src_v to implement I422ToNV12
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow = MergeUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
+ }
+ {
+ // Allocate 2 rows of vu.
+ int awidth = halfwidth * 2;
+ align_buffer_64(row_vu_0, awidth * 2);
+ uint8_t* row_vu_1 = row_vu_0 + awidth;
+
+ for (y = 0; y < height - 1; y += 2) {
+ MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
+ MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
+ halfwidth);
+ InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ MergeUVRow(src_v, src_u, dst_vu, halfwidth);
+ }
+ free_aligned_buffer_64(row_vu_0);
+ }
+ return 0;
+}
+#endif // I422TONV21_ROW_VERSION
+
// 444 chroma is 1x width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
@@ -237,6 +426,59 @@ int I444ToI420(const uint8_t* src_y,
dst_v, dst_stride_v, width, height, width, height);
}
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
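+
+// Usage sketch (illustrative only; even width assumed): 444 chroma is full
+// resolution, so every source stride equals the width; the VU destination is
+// width x ceil(height / 2) bytes.
+#if 0
+static void ExampleI444ToNV21(const uint8_t* y, const uint8_t* u,
+                              const uint8_t* v, uint8_t* out_y,
+                              uint8_t* out_vu, int w, int h) {
+  I444ToNV21(y, w, u, w, v, w, out_y, w, out_vu, w, w, h);
+}
+#endif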
+
// I400 is greyscale typically used in MJPG
LIBYUV_API
int I400ToI420(const uint8_t* src_y,
@@ -269,70 +511,50 @@ int I400ToI420(const uint8_t* src_y,
return 0;
}
-static void CopyPlane2(const uint8_t* src,
- int src_stride_0,
- int src_stride_1,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int y;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+// I400 is greyscale, typically used in MJPG
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_vu || width <= 0 || height == 0) {
+ return -1;
}
-#endif
-
- // Copy plane
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride, width);
- src += src_stride_0 + src_stride_1;
- dst += dst_stride * 2;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
}
- if (height & 1) {
- CopyRow(src, dst, width);
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+ SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
+ return 0;
}
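+
+// Usage sketch (illustrative only; even width assumed): the chroma of the
+// result is uniform grey, because the VU plane is filled with the neutral
+// value 128.
+#if 0
+static void ExampleI400ToNV21(const uint8_t* gray, uint8_t* out_y,
+                              uint8_t* out_vu, int w, int h) {
+  I400ToNV21(gray, w, out_y, w, out_vu, w, w, h);
+}
+#endif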
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to
-// this as well as the two Y planes.
-static int X420ToI420(const uint8_t* src_y,
- int src_stride_y0,
- int src_stride_y1,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
+// Convert NV12 to I420.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
@@ -342,21 +564,16 @@ static int X420ToI420(const uint8_t* src_y,
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
- if (dst_y) {
- dst_y = dst_y + (height - 1) * dst_stride_y;
- }
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
}
// Coalesce rows.
- if (src_stride_y0 == width && src_stride_y1 == width &&
- dst_stride_y == width) {
+ if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
- src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+ src_stride_y = dst_stride_y = 0;
}
// Coalesce rows.
if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
@@ -367,12 +584,7 @@ static int X420ToI420(const uint8_t* src_y,
}
if (dst_y) {
- if (src_stride_y0 == src_stride_y1) {
- CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
- } else {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
// Split UV plane - NV12 / NV21
@@ -382,25 +594,6 @@ static int X420ToI420(const uint8_t* src_y,
return 0;
}
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
- dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
- dst_stride_v, width, height);
-}
-
// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
LIBYUV_API
int NV21ToI420(const uint8_t* src_y,
@@ -415,26 +608,8 @@ int NV21ToI420(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
- dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
- dst_stride_u, width, height);
-}
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
width, height);
}
@@ -492,7 +667,19 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
@@ -573,6 +760,16 @@ int UYVYToI420(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToYRow = UYVYToYRow_Any_MMI;
+ UYVYToUVRow = UYVYToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_MMI;
+ UYVYToUVRow = UYVYToUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_UYVYTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
@@ -600,6 +797,144 @@ int UYVYToI420(const uint8_t* src_uyvy,
return 0;
}
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToUVRow = AYUVToUVRow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToUVRow = AYUVToUVRow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToUVRow = AYUVToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToUVRow = AYUVToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ AYUVToUVRow(src_ayuv, 0, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_vu, int width) = AYUVToVURow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToVURow = AYUVToVURow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToVURow = AYUVToVURow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToVURow = AYUVToVURow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToVURow = AYUVToVURow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToVURow = AYUVToVURow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToVURow = AYUVToVURow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ AYUVToVURow(src_ayuv, 0, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
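+
+// Usage sketch (illustrative only; even width assumed): AYUV is 4 bytes per
+// pixel, so the source stride is width * 4. Each pass of the loop above
+// consumes two source rows and emits two Y rows but only one VU row (4:2:0).
+#if 0
+static void ExampleAYUVToNV21(const uint8_t* ayuv, uint8_t* out_y,
+                              uint8_t* out_vu, int w, int h) {
+  AYUVToNV21(ayuv, w * 4, out_y, w, out_vu, w, w, h);
+}
+#endif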
+
// Convert ARGB to I420.
LIBYUV_API
int ARGBToI420(const uint8_t* src_argb,
@@ -663,17 +998,25 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -749,18 +1092,24 @@ int BGRAToI420(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_BGRATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToYRow = BGRAToYRow_Any_MSA;
+#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BGRAToYRow = BGRAToYRow_Any_MMI;
+ BGRAToUVRow = BGRAToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- BGRAToYRow = BGRAToYRow_MSA;
+ BGRAToUVRow = BGRAToUVRow_MMI;
}
}
#endif
-#if defined(HAS_BGRATOUVROW_MSA)
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToYRow = BGRAToYRow_Any_MSA;
BGRAToUVRow = BGRAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
BGRAToUVRow = BGRAToUVRow_MSA;
}
}
@@ -819,6 +1168,16 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
@@ -835,18 +1194,24 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToYRow = ABGRToYRow_Any_MSA;
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
+ ABGRToUVRow = ABGRToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
ABGRToUVRow = ABGRToUVRow_MSA;
}
}
@@ -921,18 +1286,24 @@ int RGBAToI420(const uint8_t* src_rgba,
}
}
#endif
-#if defined(HAS_RGBATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToYRow = RGBAToYRow_Any_MSA;
+#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGBAToYRow = RGBAToYRow_Any_MMI;
+ RGBAToUVRow = RGBAToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- RGBAToYRow = RGBAToYRow_MSA;
+ RGBAToUVRow = RGBAToUVRow_MMI;
}
}
#endif
-#if defined(HAS_RGBATOUVROW_MSA)
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYRow = RGBAToYRow_Any_MSA;
RGBAToUVRow = RGBAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
RGBAToUVRow = RGBAToUVRow_MSA;
}
}
@@ -967,7 +1338,8 @@ int RGB24ToI420(const uint8_t* src_rgb24,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB24ToUVRow_C;
@@ -1004,7 +1376,21 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
}
-#elif defined(HAS_RGB24TOYROW_MSA)
+// MMI and MSA versions do direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA))
+#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
+ RGB24ToYRow = RGB24ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
RGB24ToYRow = RGB24ToYRow_Any_MSA;
@@ -1013,6 +1399,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
RGB24ToUVRow = RGB24ToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
#if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -1046,14 +1433,16 @@ int RGB24ToI420(const uint8_t* src_rgb24,
#endif
{
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -1070,7 +1459,8 @@ int RGB24ToI420(const uint8_t* src_rgb24,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
#else
@@ -1079,7 +1469,160 @@ int RGB24ToI420(const uint8_t* src_rgb24,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
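+
+// The blocks above follow the dispatch pattern used throughout this file:
+// start from the portable C row function, switch to the Any_ SIMD variant
+// when the CPU flag is set (it tolerates any width), and switch again to the
+// fully aligned variant when the width divides evenly. Sketched below with
+// hypothetical FooRow names, not a real row function in this library:
+#if 0
+void (*FooRow)(const uint8_t* src, uint8_t* dst, int width) = FooRow_C;
+if (TestCpuFlag(kCpuHasNEON)) {
+  FooRow = FooRow_Any_NEON;  // any width; slower tail handling
+  if (IS_ALIGNED(width, 16)) {
+    FooRow = FooRow_NEON;    // full-speed path, width % 16 == 0
+  }
+}
+#endif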
+
+// TODO(fbarchard): Use Matrix version to implement I420 and J420.
+// Convert RGB24 to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)
+ void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVJRow_C;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYJRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_NEON;
+ }
+ }
+ }
+// MMI and MSA versions do direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA))
+#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+ }
+ }
+#endif
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+ RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1099,7 +1642,8 @@ int RAWToI420(const uint8_t* src_raw,
int width,
int height) {
int y;
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+ defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)
void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
uint8_t* dst_v, int width) = RAWToUVRow_C;
void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
@@ -1124,7 +1668,7 @@ int RAWToI420(const uint8_t* src_raw,
}
// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
RAWToYRow = RAWToYRow_Any_NEON;
@@ -1135,7 +1679,21 @@ int RAWToI420(const uint8_t* src_raw,
}
}
}
-#elif defined(HAS_RAWTOYROW_MSA)
+// MMI and MSA versions do direct RAW to YUV.
+#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA))
+#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToUVRow = RAWToUVRow_Any_MMI;
+ RAWToYRow = RAWToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToUVRow = RAWToUVRow_Any_MSA;
RAWToYRow = RAWToYRow_Any_MSA;
@@ -1144,6 +1702,7 @@ int RAWToI420(const uint8_t* src_raw,
RAWToUVRow = RAWToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else
#if defined(HAS_RAWTOARGBROW_SSSE3)
@@ -1177,14 +1736,16 @@ int RAWToI420(const uint8_t* src_raw,
#endif
{
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@@ -1201,7 +1762,8 @@ int RAWToI420(const uint8_t* src_raw,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
#else
@@ -1210,7 +1772,8 @@ int RAWToI420(const uint8_t* src_raw,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1230,7 +1793,8 @@ int RGB565ToI420(const uint8_t* src_rgb565,
int width,
int height) {
int y;
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB565ToUVRow_C;
@@ -1267,7 +1831,21 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
}
}
-#elif defined(HAS_RGB565TOYROW_MSA)
+// MMI and MSA versions do direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA))
+#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
+ RGB565ToYRow = RGB565ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
RGB565ToYRow = RGB565ToYRow_Any_MSA;
@@ -1276,6 +1854,7 @@ int RGB565ToI420(const uint8_t* src_rgb565,
RGB565ToUVRow = RGB565ToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -1316,13 +1895,15 @@ int RGB565ToI420(const uint8_t* src_rgb565,
#endif
#endif
{
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
@@ -1339,7 +1920,8 @@ int RGB565ToI420(const uint8_t* src_rgb565,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
#else
@@ -1348,7 +1930,8 @@ int RGB565ToI420(const uint8_t* src_rgb565,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1368,7 +1951,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
int width,
int height) {
int y;
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB1555ToUVRow_C;
@@ -1406,7 +1990,21 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
}
}
-#elif defined(HAS_ARGB1555TOYROW_MSA)
+// MMI and MSA versions do direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA))
+#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
@@ -1415,6 +2013,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -1455,14 +2054,16 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
#endif
#endif
{
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
@@ -1481,7 +2082,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
#else
@@ -1490,7 +2092,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1510,7 +2113,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
int width,
int height) {
int y;
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB4444ToUVRow_C;
@@ -1548,6 +2151,17 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
}
+#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
+ }
+ }
+ }
// Other platforms do intermediate conversion from ARGB4444 to ARGB.
#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
@@ -1594,7 +2208,19 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVRow = ARGBToUVRow_Any_MSA;
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -1609,14 +2235,14 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
#endif
{
-#if !defined(HAS_ARGB4444TOYROW_NEON)
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
@@ -1635,7 +2261,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
#else
@@ -1644,13 +2270,161 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !defined(HAS_ARGB4444TOYROW_NEON)
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
return 0;
}
+// Convert RGB24 to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
+ RGB24ToYJRow_C;
+ if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
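+
+// Usage sketch (illustrative only): extract full-range (JPEG) luma from a
+// tightly packed RGB24 buffer, so the source stride is width * 3.
+#if 0
+static void ExampleRGB24ToJ400(const uint8_t* rgb, uint8_t* gray, int w,
+                               int h) {
+  RGB24ToJ400(rgb, w * 3, gray, w, w, h);
+}
+#endif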
+
+// Convert RAW to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+ RAWToYJRow_C;
+ if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_yj = 0;
+ }
+#if defined(HAS_RAWTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToYJRow = RAWToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RAWToYJRow = RAWToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToYJRow = RAWToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToYJRow(src_raw, dst_yj, width);
+ src_raw += src_stride_raw;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
static void SplitPixels(const uint8_t* src_u,
int src_pixel_stride_uv,
uint8_t* dst_u,
diff --git a/chromium/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libyuv/source/convert_argb.cc
index f2fe474f704..5e7225faf21 100644
--- a/chromium/third_party/libyuv/source/convert_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_argb.cc
@@ -47,18 +47,19 @@ int ARGBCopy(const uint8_t* src_argb,
return 0;
}
-// Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -97,6 +98,14 @@ static int I420ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -226,18 +235,55 @@ int H420ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
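+
+// Usage sketch (illustrative only; even width assumed): BT.2020 I420 to
+// ABGR. As the wrapper above shows, ABGR output reuses the ARGB path with
+// the U and V planes swapped and the mirrored (Yvu) coefficient table.
+#if 0
+static void ExampleU420ToABGR(const uint8_t* y, const uint8_t* u,
+                              const uint8_t* v, uint8_t* abgr, int w, int h) {
+  U420ToABGR(y, w, u, w / 2, v, w / 2, abgr, w * 4, w, h);
+}
+#endif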
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -283,6 +329,14 @@ static int I422ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -410,20 +464,286 @@ int H422ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I444ToARGBRow = I444ToARGBRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
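Taken together, the 444 wrappers spell out the naming scheme the rest of this roll follows: the leading letter picks the colorimetry (I = BT.601 limited range, J = JPEG full range, H = BT.709, U = BT.2020), the digits give the chroma layout (420, 422, 444), and each ARGB/ABGR pair is backed by the matching kYuv*/kYvu* constant pair.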
+// Convert 10 bit YUV to AR30 with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
-static int I010ToAR30Matrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -500,6 +820,23 @@ int H010ToAR30(const uint16_t* src_y,
&kYuvH709Constants, width, height);
}
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
// Convert I010 to AB30.
LIBYUV_API
int I010ToAB30(const uint16_t* src_y,
@@ -534,18 +871,193 @@ int H010ToAB30(const uint16_t* src_y,
&kYvuH709Constants, width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
-static int I010ToARGBMatrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants, width, height);
+}
+
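The 10-bit entry points take uint16_t planes with the sample in the low 10 bits, and the source strides are counted in uint16_t elements rather than bytes (the row loops advance uint16_t pointers by the stride); the AR30 destination stride stays in bytes. A usage sketch for the new BT.2020 wrapper, assuming packed planes:

#include "libyuv/convert_argb.h"

/* Sketch: 10-bit 4:2:0 BT.2020 (U010) to 2:10:10:10 AR30. */
int RenderU010Frame(const uint16_t* y, const uint16_t* u, const uint16_t* v,
                    uint8_t* ar30, int width, int height) {
  int half_width = (width + 1) / 2;  /* 4:2:0 chroma width, in elements */
  return U010ToAR30(y, width, u, half_width, v, half_width,
                    ar30, width * 4, width, height);
}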
+// Convert 10 bit 422 YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants, width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -576,6 +1088,14 @@ static int I010ToARGBMatrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I210TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I210ToARGBRow = I210ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I210ToARGBRow = I210ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
@@ -660,23 +1180,60 @@ int H010ToABGR(const uint16_t* src_y,
width, height);
}
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
- void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
- I444ToARGBRow_C;
+ I210ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -686,48 +1243,32 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I444TOARGBROW_SSSE3)
+#if defined(HAS_I210TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
}
}
#endif
-#if defined(HAS_I444TOARGBROW_AVX2)
+#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I444ToARGBRow = I444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I444ToARGBRow = I444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_NEON;
+ I210ToARGBRow = I210ToARGBRow_AVX2;
}
}
#endif
-#if defined(HAS_I444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I444ToARGBRow = I444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_MSA;
+#if defined(HAS_I210TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I210ToARGBRow = I210ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I210ToARGBRow = I210ToARGBRow_MMI;
}
}
#endif
-
for (y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -736,74 +1277,130 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
return 0;
}
-// Convert I444 to ARGB.
+// Convert I210 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
+int I210ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_argb, dst_stride_argb,
&kYuvI601Constants, width, height);
}
-// Convert I444 to ABGR.
+// Convert I210 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
+int I210ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
src_stride_v, // Swap U and V
src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
-// Convert J444 to ARGB.
+// Convert H210 to ARGB.
LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
+int H210ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
}
-// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height,
- int attenuate) {
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
int y;
void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* a_buf,
@@ -845,6 +1442,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422ALPHATOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
@@ -877,6 +1482,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
@@ -946,16 +1559,18 @@ int I420AlphaToABGR(const uint8_t* src_y,
width, height, attenuate);
}
-// Convert I400 to ARGB.
+// Convert I400 to ARGB with matrix.
LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
- void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
+ void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
I400ToARGBRow_C;
if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -996,6 +1611,14 @@ int I400ToARGB(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I400ToARGBRow = I400ToARGBRow_Any_MSA;
@@ -1006,13 +1629,25 @@ int I400ToARGB(const uint8_t* src_y,
#endif
for (y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
+ I400ToARGBRow(src_y, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
return 0;
}
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
// Convert J400 to ARGB.
LIBYUV_API
int J400ToARGB(const uint8_t* src_y,
@@ -1063,6 +1698,14 @@ int J400ToARGB(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_J400TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ J400ToARGBRow = J400ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_J400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
J400ToARGBRow = J400ToARGBRow_Any_MSA;
@@ -1193,6 +1836,14 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
+#if defined(HAS_RGB24TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_RGB24TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
@@ -1252,6 +1903,14 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
+#if defined(HAS_RAWTOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RAWToARGBRow = RAWToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_RAWTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToARGBRow = RAWToARGBRow_Any_MSA;
@@ -1269,6 +1928,57 @@ int RAWToARGB(const uint8_t* src_raw,
return 0;
}
+// Convert RAW to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
+ RAWToRGBARow_C;
+ if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_rgba = 0;
+ }
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToRGBARow = RAWToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGBARow = RAWToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToRGBARow = RAWToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGBARow = RAWToRGBARow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToRGBARow(src_raw, dst_rgba, width);
+ src_raw += src_stride_raw;
+ dst_rgba += dst_stride_rgba;
+ }
+ return 0;
+}
+
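RAWToRGBA is new in this roll; RAW is 3-byte-per-pixel RGB in r,g,b memory order. A minimal call, assuming packed rows:

#include "libyuv/convert_argb.h"

/* Sketch: expand packed 24-bit RAW rows to 32-bit RGBA. */
int ExpandRawFrame(const uint8_t* raw, uint8_t* rgba, int width, int height) {
  return RAWToRGBA(raw, width * 3, rgba, width * 4, width, height);
}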
// Convert RGB565 to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8_t* src_rgb565,
@@ -1319,6 +2029,14 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
}
}
#endif
+#if defined(HAS_RGB565TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_RGB565TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
@@ -1386,6 +2104,14 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
}
}
#endif
+#if defined(HAS_ARGB1555TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGB1555TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
@@ -1453,6 +2179,14 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444,
}
}
#endif
+#if defined(HAS_ARGB4444TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGB4444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
@@ -1566,16 +2300,17 @@ int AR30ToAB30(const uint8_t* src_ar30,
return 0;
}
-// Convert NV12 to ARGB with matrix
-static int NV12ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1613,6 +2348,14 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV12TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ NV12ToARGBRow = NV12ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_NV12TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
@@ -1633,16 +2376,17 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to ARGB with matrix
-static int NV21ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1680,6 +2424,14 @@ static int NV21ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ NV21ToARGBRow = NV21ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_NV21TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
@@ -1729,8 +2481,9 @@ int NV21ToARGB(const uint8_t* src_y,
}
// Convert NV12 to ABGR.
-// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix.
+// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix.
// To swap the UV use NV12 instead of NV21.
+LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
@@ -1758,16 +2511,17 @@ int NV21ToABGR(const uint8_t* src_y,
}
// TODO(fbarchard): Consider SSSE3 2 step conversion.
-// Convert NV12 to RGB24 with matrix
-static int NV12ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1805,6 +2559,14 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV12TORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
@@ -1817,16 +2579,17 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to RGB24 with matrix
-static int NV21ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1864,6 +2627,14 @@ static int NV21ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
@@ -1876,7 +2647,6 @@ static int NV21ToRGB24Matrix(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix.
// Convert NV12 to RGB24.
LIBYUV_API
int NV12ToRGB24(const uint8_t* src_y,
@@ -1907,72 +2677,79 @@ int NV21ToRGB24(const uint8_t* src_y,
width, height);
}
-// Convert M420 to ARGB.
+// Convert NV12 to RAW.
LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
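The two RAW wrappers above cross over on purpose: RAW is byte-reversed RGB24, so NV12 input goes through the NV21 (VU-swapped) RGB24 path with the mirrored constants, and vice versa, which reverses each pixel's byte order without any new row code. A usage sketch, assuming the libyuv headers and an even-width, tightly packed frame (the interleaved UV plane then shares the Y stride):

#include "libyuv/convert_argb.h"

/* Sketch: NV12 (Y plane + interleaved half-height UV) to packed RAW. */
int Nv12FrameToRaw(const uint8_t* y, const uint8_t* uv,
                   uint8_t* raw, int width, int height) {
  return NV12ToRAW(y, width, uv, width, raw, width * 3, width, height);
}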
+// Convert NV21 to YUV24.
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height) {
int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
+ void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
+ uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
+ if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+ dst_stride_yuv24 = -dst_stride_yuv24;
}
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ NV21ToYUV24Row = NV21ToYUV24Row_NEON;
}
}
#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
}
}
#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
+ for (y = 0; y < height; ++y) {
+ NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+ dst_yuv24 += dst_stride_yuv24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
}
}
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, &kYuvI601Constants, width);
- dst_argb += dst_stride_argb * 2;
- src_m420 += src_stride_m420 * 3;
- }
- if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- }
return 0;
}
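Note the `if (y & 1)` step in the loop above: NV21's VU plane is subsampled 2x vertically, so one chroma row serves two luma rows. Reduced to a standalone sketch with a hypothetical row callback:

#include <stdint.h>

/* Hypothetical per-row function type, standing in for NV21ToYUV24Row. */
typedef void (*RowFn)(const uint8_t* y, const uint8_t* vu,
                      uint8_t* dst, int width);

static void For420Rows(RowFn fn,
                       const uint8_t* src_y, int src_stride_y,
                       const uint8_t* src_vu, int src_stride_vu,
                       uint8_t* dst, int dst_stride,
                       int width, int height) {
  for (int y = 0; y < height; ++y) {
    fn(src_y, src_vu, dst, width);
    dst += dst_stride;
    src_y += src_stride_y;
    if (y & 1) {  /* one VU row covers two Y rows */
      src_vu += src_stride_vu;
    }
  }
}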
@@ -2027,6 +2804,14 @@ int YUY2ToARGB(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
@@ -2094,6 +2879,14 @@ int UYVYToARGB(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ UYVYToARGBRow = UYVYToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
@@ -2124,7 +2917,7 @@ static void WeavePixels(const uint8_t* src_u,
}
}
-// Convert Android420 to ARGB.
+// Convert Android420 to ARGB with matrix.
LIBYUV_API
int Android420ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
@@ -2225,6 +3018,1107 @@ int Android420ToABGR(const uint8_t* src_y,
height);
}
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGBARow = I422ToRGBARow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
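I422ToRGBAMatrix repeats the dispatch ladder used throughout this file: start from the portable C row function, upgrade to the _Any_ SIMD variant when TestCpuFlag reports support (it vectorizes the bulk and handles the ragged tail itself), and take the fully vectorized variant only when the width meets its alignment multiple. The shape of that selection as a self-contained sketch with stub rows (all names here are illustrative, not libyuv internals):

#include <stdbool.h>
#include <stdint.h>

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

static void Row_C(const uint8_t* s, uint8_t* d, int w) {
  (void)s; (void)d; (void)w;  /* scalar fallback */
}
static void Row_Any_SIMD(const uint8_t* s, uint8_t* d, int w) {
  (void)s; (void)d; (void)w;  /* SIMD body + scalar tail, any width */
}
static void Row_SIMD(const uint8_t* s, uint8_t* d, int w) {
  (void)s; (void)d; (void)w;  /* SIMD only; requires width % 16 == 0 */
}

static RowFn PickRowFn(bool cpu_has_simd, int width) {
  RowFn fn = Row_C;
  if (cpu_has_simd) {
    fn = Row_Any_SIMD;
    if ((width & 15) == 0) {  /* IS_ALIGNED(width, 16) */
      fn = Row_SIMD;
    }
  }
  return fn;
}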
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB565Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGBARow = I422ToRGBARow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGB24Row = I422ToRGB24Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to RGB24.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to RAW.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGB565Row = I422ToRGB565Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+ width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
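For orientation, a minimal caller sketch (editorial, not part of the patch; contiguous I420 planes and the libyuv.h umbrella header are assumptions). Passing a null dither4x4 selects the built-in kDither565_4x4 table, per the fallback above:

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    // Convert a contiguous I420 frame to dithered RGB565 (2 bytes/pixel).
    int ToRGB565Dithered(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* rgb565, int width, int height) {
      const int half = (width + 1) / 2;  // I420 chroma stride
      return libyuv::I420ToRGB565Dither(y, width, u, half, v, half,
                                        rgb565, width * 2,
                                        /*dither4x4=*/nullptr, width, height);
    }
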
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToAR30Row_C;
+
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToAR30Row = I422ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToAR30Row = I422ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYvuH709Constants, width, height);
+}
+
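A similar hedged sketch for the new AR30 entry points: AR30 is a 10-bit-per-channel format with 2 alpha bits, packed 4 bytes per pixel, so the destination stride is width * 4; H420ToAR30 differs only in the constants it forwards:

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    int ToAR30(const uint8_t* y, const uint8_t* u, const uint8_t* v,
               uint8_t* ar30, int width, int height) {
      const int half = (width + 1) / 2;  // I420 chroma stride
      return libyuv::I420ToAR30(y, width, u, half, v, half,
                                ar30, width * 4, width, height);
    }
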
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc
index 6fa253237ee..f2cfc1d8f53 100644
--- a/chromium/third_party/libyuv/source/convert_from.cc
+++ b/chromium/third_party/libyuv/source/convert_from.cc
@@ -294,6 +294,14 @@ int I420ToYUY2(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToYUY2Row = I422ToYUY2Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
@@ -373,6 +381,14 @@ int I422ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -440,6 +456,14 @@ int I420ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -464,7 +488,6 @@ int I420ToUYVY(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): test negative height for invert.
LIBYUV_API
int I420ToNV12(const uint8_t* src_y,
int src_stride_y,
@@ -478,12 +501,23 @@ int I420ToNV12(const uint8_t* src_y,
int dst_stride_uv,
int width,
int height) {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
- int halfwidth = (width + 1) / 2;
- int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
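With this hunk applied, I420ToNV12 follows the library-wide convention that a negative height flips the image vertically, adjusting the source pointers up front instead of the earlier half-height arithmetic. A hedged sketch (contiguous planes assumed):

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    // Produce a vertically flipped NV12 frame from I420 by negating height.
    int FlipI420ToNV12(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
      const int half = (width + 1) / 2;
      return libyuv::I420ToNV12(y, width, u, half, v, half, dst_y, width,
                                dst_uv, half * 2, width, -height);
    }
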
@@ -510,755 +544,6 @@ int I420ToNV21(const uint8_t* src_y,
width, height);
}
-// Convert I422 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to RGB24.
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to RAW.
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height) {
- int y;
- void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
- dst_stride_argb1555 = -dst_stride_argb1555;
- }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
- width);
- dst_argb1555 += dst_stride_argb1555;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height) {
- int y;
- void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
- dst_stride_argb4444 = -dst_stride_argb4444;
- }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
- width);
- dst_argb4444 += dst_stride_argb4444;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I422 to RGB565.
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
- ARGBToRGB565DitherRow_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
- if (!dither4x4) {
- dither4x4 = kDither565_4x4;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
- }
- }
-#endif
- {
- // Allocate a row of argb.
- align_buffer_64(row_argb, width * 4);
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
- ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
- width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row_argb);
- }
- return 0;
-}
-
-// Convert I420 to AR30 with matrix
-static int I420ToAR30Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToAR30Row_C;
-
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-
-#if defined(HAS_I422TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToAR30Row = I422ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToAR30Row = I422ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToAR30Row = I422ToAR30Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYvuH709Constants, width, height);
-}
-
// Convert I420 to specified format
LIBYUV_API
int ConvertFromI420(const uint8_t* y,
@@ -1360,7 +645,6 @@ int ConvertFromI420(const uint8_t* y,
height);
break;
}
- // TODO(fbarchard): Add M420.
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/chromium/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libyuv/source/convert_from_argb.cc
index c8d91252e9b..4ba4bb5e0f5 100644
--- a/chromium/third_party/libyuv/source/convert_from_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_from_argb.cc
@@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOUV444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOUV444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
@@ -100,6 +108,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -191,17 +207,26 @@ int ARGBToI422(const uint8_t* src_argb,
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -282,17 +307,25 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -322,6 +355,14 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -418,17 +459,25 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -458,6 +507,14 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -490,6 +547,309 @@ int ARGBToNV21(const uint8_t* src_argb,
return 0;
}
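+// Convert ABGR to NV12.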
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of UV, each rounded up to a multiple of 32 bytes.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
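A usage sketch for the new ABGR path (editorial; assumes a contiguous 4-byte-per-pixel ABGR buffer and the umbrella header):

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    int AbgrToNV12(const uint8_t* abgr, uint8_t* dst_y, uint8_t* dst_uv,
                   int width, int height) {
      // NV12's interleaved UV plane has stride 2 * ((width + 1) / 2).
      return libyuv::ABGRToNV12(abgr, width * 4, dst_y, width,
                                dst_uv, ((width + 1) / 2) * 2, width, height);
    }
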
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of UV, each rounded up to a multiple of 32 bytes.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
// Convert ARGB to YUY2.
LIBYUV_API
int ARGBToYUY2(const uint8_t* src_argb,
@@ -559,17 +919,25 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -599,6 +967,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToYUY2Row = I422ToYUY2Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
@@ -696,17 +1072,25 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -736,6 +1120,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -813,6 +1205,14 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -903,6 +1303,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
@@ -969,6 +1377,14 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRAWRow = ARGBToRAWRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORAWROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
@@ -1039,6 +1455,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
@@ -1108,6 +1532,14 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB565ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
@@ -1174,6 +1606,14 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOARGB1555ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
@@ -1240,6 +1680,14 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOARGB4444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
@@ -1416,17 +1864,25 @@ int ARGBToJ420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
@@ -1517,17 +1973,25 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
@@ -1594,6 +2058,14 @@ int ARGBToJ400(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -1611,6 +2083,80 @@ int ARGBToJ400(const uint8_t* src_argb,
return 0;
}
+// Convert RGBA to J400.
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
+ RGBAToYJRow_C;
+ if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows.
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGBATOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGBAToYJRow = RGBAToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYJRow = RGBAToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYJRow = RGBAToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYJRow = RGBAToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGBAToYJRow(src_rgba, dst_yj, width);
+ src_rgba += src_stride_rgba;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
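A hedged caller for the new grayscale path; J400 is full-range luma at one byte per pixel. Note the coalescing fast path above: when both buffers are contiguous (stride equals width in pixels), the whole frame is processed as a single long row:

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    int RgbaToGray(const uint8_t* rgba, uint8_t* gray, int width, int height) {
      return libyuv::RGBAToJ400(rgba, width * 4, gray, width, width, height);
    }
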
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/convert_jpeg.cc b/chromium/third_party/libyuv/source/convert_jpeg.cc
index ae3cc18cd24..d7556ee91ba 100644
--- a/chromium/third_party/libyuv/source/convert_jpeg.cc
+++ b/chromium/third_party/libyuv/source/convert_jpeg.cc
@@ -89,12 +89,12 @@ static void JpegI400ToI420(void* opaque,
// Query size of MJPG in pixels.
LIBYUV_API
-int MJPGSize(const uint8_t* sample,
- size_t sample_size,
+int MJPGSize(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
int* width,
int* height) {
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
if (ret) {
*width = mjpeg_decoder.GetWidth();
*height = mjpeg_decoder.GetHeight();
@@ -107,8 +107,8 @@ int MJPGSize(const uint8_t* sample,
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
// dst_height may be enough.
LIBYUV_API
-int MJPGToI420(const uint8_t* sample,
- size_t sample_size,
+int MJPGToI420(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
@@ -119,14 +119,14 @@ int MJPGToI420(const uint8_t* sample,
int src_height,
int dst_width,
int dst_height) {
- if (sample_size == kUnknownDataSize) {
+ if (src_size_mjpg == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
mjpeg_decoder.GetHeight() != src_height)) {
// ERROR: MJPEG frame has unexpected dimensions
@@ -180,9 +180,281 @@ int MJPGToI420(const uint8_t* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
dst_height);
} else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice.
- // ERROR: Unable to convert MJPEG frame because format is not supported
+ // TODO(fbarchard): Implement conversion for any other colorspace/subsample
+ // factors that occur in practice.
+ // ERROR: Unable to convert MJPEG frame because format is not supported.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
+struct NV21Buffers {
+ uint8_t* y;
+ int y_stride;
+ uint8_t* vu;
+ int vu_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV21.
+LIBYUV_API
+int MJPGToNV21(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (src_size_mjpg == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, dst_width, dst_height};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
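For context, a same-size decode sketch (assumptions: a HAVE_JPEG build, the umbrella header, and destination buffers already sized from MJPGSize, shown earlier in this file):

    #include <stddef.h>
    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header; MJPG paths need HAVE_JPEG

    int DecodeMjpgToNV21(const uint8_t* jpg, size_t jpg_size, uint8_t* dst_y,
                         uint8_t* dst_vu, int width, int height) {
      const int vu_stride = ((width + 1) / 2) * 2;
      // Source and destination dimensions match; MJPGToNV21 re-verifies them.
      return libyuv::MJPGToNV21(jpg, jpg_size, dst_y, width, dst_vu, vu_stride,
                                width, height, width, height);
    }
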
+static void JpegI420ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+  // Use the NV21 converter: with no chroma samples to swap, NV12 and NV21
+  // are identical.
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV12.
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ // Use NV21Buffers but with UV instead of VU.
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv,
+ dst_stride_uv, dst_width, dst_height};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
mjpeg_decoder.UnloadFrame();
return 1;
}
@@ -190,7 +462,6 @@ int MJPGToI420(const uint8_t* sample,
return ret ? 0 : 1;
}
-#ifdef HAVE_JPEG
struct ARGBBuffers {
uint8_t* argb;
int argb_stride;
@@ -245,22 +516,22 @@ static void JpegI400ToARGB(void* opaque,
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
// dst_height may be enough.
LIBYUV_API
-int MJPGToARGB(const uint8_t* sample,
- size_t sample_size,
+int MJPGToARGB(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
uint8_t* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
int dst_width,
int dst_height) {
- if (sample_size == kUnknownDataSize) {
+ if (src_size_mjpg == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
mjpeg_decoder.GetHeight() != src_height)) {
// ERROR: MJPEG frame has unexpected dimensions
@@ -313,18 +584,17 @@ int MJPGToARGB(const uint8_t* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
dst_height);
} else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice.
- // ERROR: Unable to convert MJPEG frame because format is not supported
+      // TODO(fbarchard): Implement conversion for any other
+      // colorspace/subsample factors that occur in practice.
+      // ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
}
}
return ret ? 0 : 1;
}
-#endif
-#endif
+#endif // HAVE_JPEG
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libyuv/source/convert_to_argb.cc
index bde1aa8891b..84df16c8c26 100644
--- a/chromium/third_party/libyuv/source/convert_to_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_to_argb.cc
@@ -32,9 +32,6 @@ extern "C" {
// TODO(fbarchard): Add the following:
// H010ToARGB
// I010ToARGB
-// J400ToARGB
-// J422ToARGB
-// J444ToARGB
LIBYUV_API
int ConvertToARGB(const uint8_t* sample,
@@ -161,6 +158,11 @@ int ConvertToARGB(const uint8_t* sample,
r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
+ case FOURCC_J400:
+ src = sample + src_width * crop_y + crop_x;
+ r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height);
+ break;
// Biplanar formats
case FOURCC_NV12:
@@ -178,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample,
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
dst_stride_argb, crop_width, inv_crop_height);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
- inv_crop_height);
- break;
-
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
@@ -208,6 +204,19 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J420: {
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8_t* src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
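+      // Planar J420 layout assumed by the offset math above:
+      //   Y: src_width * abs_src_height bytes, then
+      //   U: halfwidth * halfheight bytes, then
+      //   V: halfwidth * halfheight bytes.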
+
case FOURCC_H420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
@@ -221,7 +230,7 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
- case FOURCC_J420: {
+ case FOURCC_U420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
@@ -229,7 +238,7 @@ int ConvertToARGB(const uint8_t* sample,
(halfwidth * crop_y + crop_x) / 2;
const uint8_t* src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
@@ -256,6 +265,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H422: {
int halfwidth = (src_width + 1) / 2;
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -268,6 +289,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_U422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_I444:
case FOURCC_YV24: {
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -284,6 +317,40 @@ int ConvertToARGB(const uint8_t* sample,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_J444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_H444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_U444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
diff --git a/chromium/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libyuv/source/convert_to_i420.cc
index df08309f9ba..ac6eeab24ef 100644
--- a/chromium/third_party/libyuv/source/convert_to_i420.cc
+++ b/chromium/third_party/libyuv/source/convert_to_i420.cc
@@ -179,11 +179,6 @@ int ConvertToI420(const uint8_t* sample,
dst_stride_y, dst_v, dst_stride_v, dst_u,
dst_stride_u, crop_width, inv_crop_height, rotation);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
@@ -193,15 +188,15 @@ int ConvertToI420(const uint8_t* sample,
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
- src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+ (crop_x / 2);
src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
} else {
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+ (crop_x / 2);
src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
}
r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
@@ -216,14 +211,14 @@ int ConvertToI420(const uint8_t* sample,
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
- crop_x / 2;
+ (crop_x / 2);
src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
} else {
src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
- crop_x / 2;
+ (crop_x / 2);
src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
}
r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc
index 31e24b6739b..fe89452b772 100644
--- a/chromium/third_party/libyuv/source/cpu_id.cc
+++ b/chromium/third_party/libyuv/source/cpu_id.cc
@@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
+ "mov %%ebx, %%edi \n"
"cpuid \n"
- "xchg %%edi, %%ebx \n"
+ "xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
@@ -163,32 +163,38 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
}
// TODO(fbarchard): Consider read_msa_ir().
-// TODO(fbarchard): Add unittest.
-LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
- const char ase[]) {
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
+ int flag = 0x0;
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
- // ase enabled if /proc/cpuinfo is unavailable.
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
+    // Assume no features if /proc/cpuinfo is unavailable.
+    // This occurs inside the Chrome sandbox for the Pepper and Render
+    // processes.
return 0;
}
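+  // Illustrative /proc/cpuinfo lines this loop matches (not verbatim from
+  // any particular kernel):
+  //   cpu model            : Loongson-3A R3 (Loongson-3A3000)
+  //   ASEs implemented     : vz loongson-mmi loongson-cam loongson-ext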
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+      // Work around early kernels that omit mmi from the ASEs line.
+ if (strstr(cpuinfo_line, "Loongson-3")) {
+ flag |= kCpuHasMMI;
+ } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+ flag |= kCpuHasMMI | kCpuHasMSA;
+ }
+ }
if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
- char* p = strstr(cpuinfo_line, ase);
- if (p) {
- fclose(f);
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- return 0;
+ if (strstr(cpuinfo_line, "loongson-mmi") &&
+ strstr(cpuinfo_line, "loongson-ext")) {
+ flag |= kCpuHasMMI;
+ }
+ if (strstr(cpuinfo_line, "msa")) {
+ flag |= kCpuHasMSA;
}
+      // The ASEs line is the last one we need, so stop scanning here.
+ break;
}
}
fclose(f);
- return 0;
+ return flag;
}
static SAFEBUFFERS int GetCpuFlags(void) {
@@ -230,9 +236,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
}
#endif
#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_msa)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
-#endif
+ cpu_info = MipsCpuCaps("/proc/cpuinfo");
cpu_info |= kCpuHasMIPS;
#endif
#if defined(__arm__) || defined(__aarch64__)
diff --git a/chromium/third_party/libyuv/source/mjpeg_decoder.cc b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
index eaf2530130b..adba832f53f 100644
--- a/chromium/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
@@ -25,7 +25,8 @@
#endif
#endif
-struct FILE; // For jpeglib.h.
+
+#include <stdio.h> // For jpeglib.h.
// C++ build requires extern C for jpeg internals.
#ifdef __cplusplus
@@ -416,7 +417,10 @@ void init_source(j_decompress_ptr cinfo) {
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
+ // Don't assert-fail when fuzzing.
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
assert(0 && "No more data");
+#endif
// ERROR: No more data
return FALSE;
}
@@ -427,7 +431,15 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
}
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
- cinfo->src->next_input_byte += num_bytes;
+ jpeg_source_mgr* src = cinfo->src;
+ size_t bytes = static_cast<size_t>(num_bytes);
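+  // Clamp the skip to the bytes still in the buffer; advancing
+  // next_input_byte past the end would let later reads run out of bounds.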
+ if (bytes > src->bytes_in_buffer) {
+ src->next_input_byte = nullptr;
+ src->bytes_in_buffer = 0;
+ } else {
+ src->next_input_byte += bytes;
+ src->bytes_in_buffer -= bytes;
+ }
}
void term_source(j_decompress_ptr cinfo) {
diff --git a/chromium/third_party/libyuv/source/mjpeg_validate.cc b/chromium/third_party/libyuv/source/mjpeg_validate.cc
index 80c2cc0cb9b..ba0a03ab9e5 100644
--- a/chromium/third_party/libyuv/source/mjpeg_validate.cc
+++ b/chromium/third_party/libyuv/source/mjpeg_validate.cc
@@ -18,10 +18,10 @@ extern "C" {
#endif
// Helper function to scan for EOI marker (0xff 0xd9).
-static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) {
- if (sample_size >= 2) {
- const uint8_t* end = sample + sample_size - 1;
- const uint8_t* it = sample;
+static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+ if (src_size_mjpg >= 2) {
+ const uint8_t* end = src_mjpg + src_size_mjpg - 1;
+ const uint8_t* it = src_mjpg;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = (const uint8_t*)(memchr(it, 0xff, end - it));
@@ -34,34 +34,35 @@ static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) {
++it; // Skip over current 0xff.
}
}
- // ERROR: Invalid jpeg end code not found. Size sample_size
+  // ERROR: Invalid jpeg: end code not found. Size: src_size_mjpg
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
-LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) {
+LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
- if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
- // ERROR: Invalid jpeg size: sample_size
+ if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) {
+ // ERROR: Invalid jpeg size: src_size_mjpg
return LIBYUV_FALSE;
}
- if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
+  // SOI marker (0xff 0xd8), plus the 0xff that starts the next marker.
+ if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) {
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
// Look for the End Of Image (EOI) marker near the end of the buffer.
- if (sample_size > kBackSearchSize) {
- if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ if (src_size_mjpg > kBackSearchSize) {
+ if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
// Reduce search size for forward search.
- sample_size = sample_size - kBackSearchSize + 1;
+ src_size_mjpg = src_size_mjpg - kBackSearchSize + 1;
}
// Step over SOI marker and scan for EOI.
- return ScanEOI(sample + 2, sample_size - 2);
+ return ScanEOI(src_mjpg + 2, src_size_mjpg - 2);
}
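+// Usage sketch (illustrative, not part of this patch):
+//   if (!ValidateJpeg(src_mjpg, src_size_mjpg)) {
+//     return 1;  // Reject the buffer before attempting a full decode.
+//   }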
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc
index 5eae3f763a7..4e8908c2eba 100644
--- a/chromium/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libyuv/source/planar_functions.cc
@@ -349,6 +349,39 @@ int I420ToI400(const uint8_t* src_y,
return 0;
}
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+ int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_uv, int dst_stride_uv, int width, int height) {
+ if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+ halfheight);
+ return 0;
+}
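+// Usage sketch (illustrative, tightly packed buffers): a negative height
+// copies the image flipped vertically:
+//   NV12Copy(src_y, w, src_uv, w, dst_y, w, dst_uv, w, w, -h);
+// The chroma rows above are halfwidth * 2 bytes wide because U and V are
+// interleaved in a single plane.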
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+ int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_vu, int dst_stride_vu, int width, int height) {
+ return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
// Support function for NV12 etc. UV channels.
// Width and height are plane sizes (typically half the pixel width).
LIBYUV_API
@@ -402,6 +435,14 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
@@ -432,7 +473,6 @@ void MergeUVPlane(const uint8_t* src_u,
int y;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
- // Coalesce rows.
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -470,6 +510,14 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow = MergeUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
@@ -488,6 +536,96 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_vu = 0;
+ }
+
+#if defined(HAS_SWAPUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SwapUVRow = SwapUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SwapUVRow = SwapUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SwapUVRow = SwapUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SwapUVRow(src_uv, dst_vu, width);
+ src_uv += src_stride_uv;
+ dst_vu += dst_stride_vu;
+ }
+}
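+// The "coalesce rows" step above is a recurring libyuv optimization: when
+// both strides equal width * 2 the plane is contiguous, so it can be
+// processed as one row of width * height elements and the row function is
+// called once instead of once per row.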
+
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_vu || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_vu = src_vu + (halfheight - 1) * src_stride_vu;
+ src_stride_vu = -src_stride_vu;
+ }
+
+ SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
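+// Note: dst_y may be NULL to convert only the chroma plane; the luma copy
+// above is skipped in that case.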
+
// Support function for NV12 etc RGB channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
@@ -529,6 +667,14 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
+#if defined(HAS_SPLITRGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitRGBRow = SplitRGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ SplitRGBRow = SplitRGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitRGBRow = SplitRGBRow_Any_NEON;
@@ -593,6 +739,14 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
#endif
+#if defined(HAS_MERGERGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeRGBRow = MergeRGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MergeRGBRow = MergeRGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of RGB.
@@ -604,62 +758,6 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
- int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
-
- // Mirror plane
- for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -724,7 +822,17 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ YUY2ToUV422Row = YUY2ToUV422Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
@@ -810,7 +918,17 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MSA)
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToYRow = UYVYToYRow_Any_MMI;
+ UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_MMI;
+ UYVYToUV422Row = UYVYToUV422Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
@@ -882,6 +1000,14 @@ int YUY2ToY(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_YUY2TOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
@@ -899,6 +1025,130 @@ int YUY2ToY(const uint8_t* src_yuy2,
return 0;
}
+// Mirror a plane of data.
+// See also I400Mirror().
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
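+// Dispatch pattern used throughout this file: start with the portable
+// MirrorRow_C, then upgrade the function pointer when the CPU supports a
+// SIMD path. The _Any_ variants handle arbitrary widths; the exact variants
+// are only chosen when the width meets the kernel's alignment requirement.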
+
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+ MirrorUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorUVRow = MirrorUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorUVRow = MirrorUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorUVRow = MirrorUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorUVRow = MirrorUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorUVRow = MirrorUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_MSA;
+ }
+ }
+#endif
+
+ // MirrorUV plane
+ for (y = 0; y < height; ++y) {
+ MirrorUVRow(src_uv, dst_uv, width);
+ src_uv += src_stride_uv;
+ dst_uv += dst_stride_uv;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -939,7 +1189,7 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
@@ -963,6 +1213,41 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
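+// NV12 chroma is mirrored with MirrorUVPlane rather than MirrorPlane: the
+// row must be reversed in 2-byte UV pairs, since reversing single bytes
+// would also swap U and V.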
+
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -986,7 +1271,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -1007,6 +1292,14 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMirrorRow = ARGBMirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBMIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
@@ -1025,6 +1318,52 @@ int ARGBMirror(const uint8_t* src_argb,
return 0;
}
+// RGB24 mirror.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
+ RGB24MirrorRow_C;
+ if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+#if defined(HAS_RGB24MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_SSSE3;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ RGB24MirrorRow(src_rgb24, dst_rgb24, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
// Get a blender optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
@@ -1043,6 +1382,11 @@ ARGBBlendRow GetARGBBlend() {
ARGBBlendRow = ARGBBlendRow_NEON;
}
#endif
+#if defined(HAS_ARGBBLENDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBBlendRow = ARGBBlendRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBBLENDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBBlendRow = ARGBBlendRow_MSA;
@@ -1140,6 +1484,14 @@ int BlendPlane(const uint8_t* src_y0,
}
}
#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BlendPlaneRow = BlendPlaneRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ BlendPlaneRow = BlendPlaneRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
@@ -1216,6 +1568,14 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BlendPlaneRow = BlendPlaneRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ BlendPlaneRow = BlendPlaneRow_MMI;
+ }
+ }
+#endif
if (!IS_ALIGNED(width, 2)) {
ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
}
@@ -1252,6 +1612,17 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ ScaleRowDown2 = ScaleRowDown2Box_MMI;
+ }
+ }
+ }
+#endif
// Row buffer for intermediate alpha pixels.
align_buffer_64(halfalpha, halfwidth);
@@ -1329,6 +1700,14 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBMULTIPLYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBMULTIPLYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
@@ -1406,6 +1785,14 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBADDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAddRow = ARGBAddRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAddRow = ARGBAddRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAddRow = ARGBAddRow_Any_MSA;
@@ -1478,6 +1865,14 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBSUBTRACTROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBSubtractRow = ARGBSubtractRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBSUBTRACTROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
@@ -1496,177 +1891,6 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
return 0;
}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB565Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
// Convert RAW to RGB24.
LIBYUV_API
@@ -1710,6 +1934,14 @@ int RAWToRGB24(const uint8_t* src_raw,
}
}
#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RAWToRGB24Row = RAWToRGB24Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_RAWTORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
@@ -1853,6 +2085,14 @@ int ARGBRect(uint8_t* dst_argb,
ARGBSetRow = ARGBSetRow_X86;
}
#endif
+#if defined(HAS_ARGBSETROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBSetRow = ARGBSetRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBSETROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBSetRow = ARGBSetRow_Any_MSA;
@@ -1931,6 +2171,14 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
@@ -2034,6 +2282,11 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBGrayRow = ARGBGrayRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
@@ -2079,6 +2332,11 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBGrayRow = ARGBGrayRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
@@ -2122,6 +2380,11 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
#endif
+#if defined(HAS_ARGBSEPIAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBSepiaRow = ARGBSepiaRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBSEPIAROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_MSA;
@@ -2173,6 +2436,11 @@ int ARGBColorMatrix(const uint8_t* src_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
@@ -2372,6 +2640,12 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb,
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
}
#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+ }
+#endif
+
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
for (y = 0; y < height; ++y) {
ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
@@ -2430,6 +2704,11 @@ int ARGBBlur(const uint8_t* src_argb,
CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+ }
+#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
@@ -2531,6 +2810,11 @@ int ARGBShade(const uint8_t* src_argb,
ARGBShadeRow = ARGBShadeRow_NEON;
}
#endif
+#if defined(HAS_ARGBSHADEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBShadeRow = ARGBShadeRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBSHADEROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_MSA;
@@ -2599,6 +2883,14 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -2722,6 +3014,14 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
+#if defined(HAS_ARGBSHUFFLEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBShuffleRow = ARGBShuffleRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBSHUFFLEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
@@ -2739,6 +3039,80 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
+// Gaussian blur a float plane using a 5x5 filter with coefficients
+// 1, 4, 6, 4, 1. Each destination pixel is a blur of the 5x5 pixels
+// around it in the source. Source edges are clamped.
+// The edge is 2 pixels on each side, and the interior is a multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int y;
+ void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+ const float* src3, const float* src4, float* dst,
+ int width) = GaussCol_F32_C;
+ void (*GaussRow_F32)(const float* src, float* dst, int width) =
+ GaussRow_F32_C;
+ if (!src || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussCol_F32 = GaussCol_F32_NEON;
+ }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussRow_F32 = GaussRow_F32_NEON;
+ }
+#endif
+ {
+ // 2 pixels on each side, but aligned out to 16 bytes.
+ align_buffer_64(rowbuf, (4 + width + 4) * 4);
+ memset(rowbuf, 0, 16);
+ memset(rowbuf + (4 + width) * 4, 0, 16);
+ float* row = (float*)(rowbuf + 16);
+ const float* src0 = src;
+ const float* src1 = src;
+ const float* src2 = src;
+ const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+ const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+ for (y = 0; y < height; ++y) {
+ GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+ // Extrude edge by 2 floats
+ row[-2] = row[-1] = row[0];
+ row[width + 1] = row[width] = row[width - 1];
+
+ GaussRow_F32(row - 2, dst, width);
+
+ src0 = src1;
+ src1 = src2;
+ src2 = src3;
+ src3 = src4;
+ if ((y + 2) < (height - 1)) {
+ src4 += src_stride;
+ }
+ dst += dst_stride;
+ }
+ free_aligned_buffer_64(rowbuf);
+ }
+ return 0;
+}
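+// The 5x5 Gaussian above is separable: each pass applies
+//   1*a + 4*b + 6*c + 4*d + 1*e
+// once down columns (GaussCol_F32) and once along rows (GaussRow_F32).
+// The weights sum to 16 per pass, 256 for the full 5x5 kernel; the
+// normalization is assumed to live in the row/column kernels, which are
+// not shown in this patch.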
+
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
@@ -2793,6 +3167,14 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -2812,6 +3194,11 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelYRow = SobelYRow_NEON;
}
#endif
+#if defined(HAS_SOBELYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelYRow = SobelYRow_MMI;
+ }
+#endif
#if defined(HAS_SOBELYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelYRow = SobelYRow_MSA;
@@ -2827,6 +3214,11 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelXRow = SobelXRow_NEON;
}
#endif
+#if defined(HAS_SOBELXROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelXRow = SobelXRow_MMI;
+ }
+#endif
#if defined(HAS_SOBELXROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelXRow = SobelXRow_MSA;
@@ -2906,6 +3298,14 @@ int ARGBSobel(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SOBELROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelRow = SobelRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SOBELROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelRow = SobelRow_Any_MSA;
@@ -2944,6 +3344,14 @@ int ARGBSobelToPlane(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SOBELTOPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelToPlaneRow = SobelToPlaneRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SOBELTOPLANEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
@@ -2983,6 +3391,14 @@ int ARGBSobelXY(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SOBELXYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelXYRow = SobelXYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SOBELXYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelXYRow = SobelXYRow_Any_MSA;
@@ -3228,6 +3644,14 @@ int ARGBCopyAlpha(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBCOPYALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBCopyAlphaRow(src_argb, dst_argb, width);
@@ -3280,6 +3704,12 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_NEON;
}
#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
+ : ARGBExtractAlphaRow_Any_MMI;
+ }
+#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
@@ -3337,6 +3767,14 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBCopyYToAlphaRow(src_y, dst_argb, width);
@@ -3398,6 +3836,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
@@ -3430,6 +3876,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -3514,6 +3968,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
@@ -3546,6 +4008,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -3581,6 +4051,56 @@ int UYVYToNV12(const uint8_t* src_uyvy,
return 0;
}
+// width and height are the source plane size; odd sizes are handled.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
+ for (y = 0; y < height - 1; y += 2) {
+ // Merge a row of U and V into a row of UV.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
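+// Each output UV pair is assumed to average a 2x2 block of the full-size
+// U and V planes (hence the src strides passed into the row function and
+// the y += 2 loop). A final odd row is handled by passing a stride of 0,
+// averaging the row with itself.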
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/rotate.cc b/chromium/third_party/libyuv/source/rotate.cc
index f2bed85b755..32904e47312 100644
--- a/chromium/third_party/libyuv/source/rotate.cc
+++ b/chromium/third_party/libyuv/source/rotate.cc
@@ -36,6 +36,15 @@ void TransposePlane(const uint8_t* src,
void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx8_C;
#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#else
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
@@ -49,6 +58,11 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
+#if defined(HAS_TRANSPOSEWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ TransposeWx8 = TransposeWx8_MMI;
+ }
+#endif
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
@@ -57,14 +71,7 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeWx16 = TransposeWx16_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- TransposeWx16 = TransposeWx16_MSA;
- }
- }
-#endif
+#endif /* defined(HAS_TRANSPOSEWX16_MSA) */
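+// Note: on MSA the plane is transposed in 16-wide strips (TransposeWx16);
+// every other path uses 8-wide strips, so the #if/#else above keeps the
+// two dispatch sets mutually exclusive.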
#if defined(HAS_TRANSPOSEWX16_MSA)
// Work across the source in 16x16 tiles
@@ -137,7 +144,7 @@ void RotatePlane180(const uint8_t* src,
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
@@ -158,6 +165,14 @@ void RotatePlane180(const uint8_t* src,
}
}
#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
@@ -186,14 +201,19 @@ void RotatePlane180(const uint8_t* src,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
+ }
+#endif
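+  // The loop below swaps rows through a scratch buffer: copy row y, mirror
+  // row (height - 1 - y) into row y, then mirror the saved copy into row
+  // (height - 1 - y). Buffering the first row appears intended to keep the
+  // rotation correct when src and dst overlap.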
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
- MirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
+ CopyRow(src, row, width); // Copy first row into buffer
MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ MirrorRow(row, dst_bot, width); // Mirror buffer into last row
+ src += src_stride;
dst += dst_stride;
- CopyRow(row, dst_bot, width); // Copy first mirrored row into last
src_bot -= src_stride;
dst_bot -= dst_stride;
}
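Each pass of the rewritten loop saves the top source row, mirrors the bottom source row into the top destination row, and mirrors the saved copy into the bottom destination row; the net effect is unchanged, but the scratch buffer now holds unmirrored pixels and both row reversals go through MirrorRow. A hedged sketch of the same swap pattern on plain byte rows, with mirror_row() as a hypothetical helper that writes its input reversed:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static void mirror_row(const uint8_t* src, uint8_t* dst, int width) {
      for (int x = 0; x < width; ++x) dst[x] = src[width - 1 - x];
    }

    // Rotate one plane 180 degrees using a single temporary row, swapping
    // and reversing rows in top/bottom pairs as the rewritten loop does.
    static void Rotate180Sketch(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride,
                                int width, int height) {
      const uint8_t* src_bot = src + (size_t)src_stride * (height - 1);
      uint8_t* dst_bot = dst + (size_t)dst_stride * (height - 1);
      uint8_t* row = (uint8_t*)malloc(width);
      // Odd height mirrors the middle row twice, harmlessly.
      for (int y = 0; y < (height + 1) / 2; ++y) {
        memcpy(row, src, width);          // copy top row into buffer
        mirror_row(src_bot, dst, width);  // mirror bottom row into top
        mirror_row(row, dst_bot, width);  // mirror buffer into bottom
        src += src_stride;
        dst += dst_stride;
        src_bot -= src_stride;
        dst_bot -= dst_stride;
      }
      free(row);
    }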
@@ -219,6 +239,15 @@ void TransposeUV(const uint8_t* src,
int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#else
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -232,14 +261,15 @@ void TransposeUV(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeUVWx16 = TransposeUVWx16_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx16 = TransposeUVWx16_MSA;
+#if defined(HAS_TRANSPOSEUVWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_MMI;
}
}
#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
#if defined(HAS_TRANSPOSEUVWX16_MSA)
// Work through the source in 8x8 tiles.
@@ -314,21 +344,26 @@ void RotateUV180(const uint8_t* src,
int width,
int height) {
int i;
- void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
- int width) = MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_NEON;
+ void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorSplitUVRow_C;
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_NEON;
}
#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- MirrorUVRow = MirrorUVRow_SSSE3;
+ MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_MMI;
}
#endif
-#if defined(HAS_MIRRORUVROW_MSA)
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
- MirrorUVRow = MirrorUVRow_MSA;
+ MirrorSplitUVRow = MirrorSplitUVRow_MSA;
}
#endif
@@ -336,7 +371,7 @@ void RotateUV180(const uint8_t* src,
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
- MirrorUVRow(src, dst_a, dst_b, width);
+ MirrorSplitUVRow(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
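The rename from MirrorUVRow to MirrorSplitUVRow describes the kernel more precisely: it walks one interleaved UV row from right to left and de-interleaves it into separate U and V outputs, which is exactly what this 180-degree rotate-and-split path needs. A sketch of the scalar fallback's likely shape (hedged; not quoted from the patch):

    #include <stdint.h>

    void MirrorSplitUVRow_C(const uint8_t* src_uv,
                            uint8_t* dst_u, uint8_t* dst_v, int width) {
      src_uv += (width - 1) * 2;  // start at the last UV pair
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[0];  // U of the mirrored position
        dst_v[x] = src_uv[1];  // V of the mirrored position
        src_uv -= 2;
      }
    }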
@@ -451,6 +486,66 @@ int I420Rotate(const uint8_t* src_y,
}
LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum libyuv::RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case libyuv::kRotate0:
+ // copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
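I444Rotate mirrors the existing I420Rotate entry point, but because I444 chroma is not subsampled, all three planes share the frame's full width and height, and for the 90/270-degree modes the destination strides follow the swapped dimensions. A usage sketch, assuming the declaration lands in libyuv/rotate.h next to I420Rotate (the tightly packed buffer layout is illustrative):

    #include <cstdint>
    #include <vector>
    #include "libyuv/rotate.h"

    // Rotate a tightly packed width x height I444 frame by 90 degrees.
    // The destination planes are height x width, so their stride is the
    // source height.
    bool RotateI444By90(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        int width, int height, std::vector<uint8_t>* out) {
      const size_t plane = (size_t)width * height;
      out->resize(3 * plane);
      uint8_t* dy = out->data();
      uint8_t* du = dy + plane;
      uint8_t* dv = du + plane;
      return libyuv::I444Rotate(y, width, u, width, v, width,
                                dy, height, du, height, dv, height,
                                width, height, libyuv::kRotate90) == 0;
    }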
+LIBYUV_API
int NV12ToI420Rotate(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
diff --git a/chromium/third_party/libyuv/source/rotate_any.cc b/chromium/third_party/libyuv/source/rotate_any.cc
index c2752e6222c..b3baf084d0c 100644
--- a/chromium/third_party/libyuv/source/rotate_any.cc
+++ b/chromium/third_party/libyuv/source/rotate_any.cc
@@ -35,6 +35,9 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
+#ifdef HAS_TRANSPOSEWX8_MMI
+TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
+#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
@@ -62,6 +65,9 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
+#ifdef HAS_TRANSPOSEUVWX8_MMI
+TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
+#endif
#ifdef HAS_TRANSPOSEUVWX16_MSA
TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
#endif
diff --git a/chromium/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libyuv/source/rotate_argb.cc
index 5a6e05376f1..ae653886018 100644
--- a/chromium/third_party/libyuv/source/rotate_argb.cc
+++ b/chromium/third_party/libyuv/source/rotate_argb.cc
@@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check that the stride is a multiple of 4 (one ARGB pixel is 4 bytes).
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -48,6 +52,14 @@ static void ARGBTranspose(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
+ if (IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
@@ -62,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@@ -113,7 +126,7 @@ void ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -134,6 +147,14 @@ void ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMirrorRow = ARGBMirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBMIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
@@ -174,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
LIBYUV_API
@@ -201,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
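With ARGBTranspose and the three rotate helpers now returning int, the new stride check propagates out of ARGBRotate: a 90- or 270-degree rotation of a surface whose stride is not a multiple of 4 bytes now fails with -1 rather than stepping through misaligned pixels (src_pixel_step is the stride divided by 4). Callers should check the result even for modes that previously could not fail; a hedged sketch:

    #include "libyuv/rotate_argb.h"

    // Returns true on success. For the transpose-based modes (90/270) the
    // source stride must now be a multiple of 4, i.e. whole ARGB pixels.
    bool RotateArgbChecked(const uint8_t* src, int src_stride,
                           uint8_t* dst, int dst_stride,
                           int width, int height, libyuv::RotationMode mode) {
      return libyuv::ARGBRotate(src, src_stride, dst, dst_stride,
                                width, height, mode) == 0;
    }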
diff --git a/chromium/third_party/libyuv/source/rotate_gcc.cc b/chromium/third_party/libyuv/source/rotate_gcc.cc
index 04e19e29eef..fd359d4ae69 100644
--- a/chromium/third_party/libyuv/source/rotate_gcc.cc
+++ b/chromium/third_party/libyuv/source/rotate_gcc.cc
@@ -31,75 +31,75 @@ void TransposeWx8_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -121,127 +121,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -266,95 +266,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
// Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
diff --git a/chromium/third_party/libyuv/source/rotate_mmi.cc b/chromium/third_party/libyuv/source/rotate_mmi.cc
new file mode 100644
index 00000000000..f8de60834d9
--- /dev/null
+++ b/chromium/third_party/libyuv/source/rotate_mmi.cc
@@ -0,0 +1,291 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+void TransposeWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
+ uint8_t* src_tmp = nullptr;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "ldc1 %[tmp12], 0x00(%[src]) \n\t"
+ "dadd %[src_tmp], %[src], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (00 10 01 11 02 12 03 13) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (04 14 05 15 06 16 07 17) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (20 30 21 31 22 32 23 33) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (24 34 25 35 26 36 27 37) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp4 = (00 10 20 30 01 11 21 31) */
+ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
+ /* tmp5 = (02 12 22 32 03 13 23 33) */
+ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
+ /* tmp6 = (04 14 24 34 05 15 25 35) */
+ "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
+ /* tmp7 = (06 16 26 36 07 17 27 37) */
+ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (40 50 41 51 42 52 43 53) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (44 54 45 55 46 56 47 57) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (60 70 61 71 62 72 63 73) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (64 74 65 75 66 76 67 77) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp8 = (40 50 60 70 41 51 61 71) */
+ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
+ /* tmp9 = (42 52 62 72 43 53 63 73) */
+ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
+ /* tmp10 = (44 54 64 74 45 55 65 75) */
+ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
+ /* tmp11 = (46 56 66 76 47 57 67 77) */
+ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
+
+ /* tmp0 = (00 10 20 30 40 50 60 70) */
+ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
+ /* tmp1 = (01 11 21 31 41 51 61 71) */
+ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (02 12 22 32 42 52 62 72) */
+ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
+ /* tmp1 = (03 13 23 33 43 53 63 73) */
+ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (04 14 24 34 44 54 64 74) */
+ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
+ /* tmp1 = (05 15 25 35 45 55 65 75) */
+ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (06 16 26 36 46 56 66 76) */
+ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
+ /* tmp1 = (07 17 27 37 47 57 67 77) */
+ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "daddi %[src], %[src], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
+ [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
+ [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
+ [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
+ [src_tmp] "+&r"(src_tmp)
+ : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride),
+ [dst_stride] "r"(dst_stride)
+ : "memory");
+}
+
+void TransposeUVWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
+ uint8_t* src_tmp = nullptr;
+
+ __asm__ volatile(
+ "1: \n\t"
+ /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
+ "ldc1 %[tmp12], 0x00(%[src]) \n\t"
+ "dadd %[src_tmp], %[src], %[src_stride] \n\t"
+ /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
+ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
+ /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
+ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
+ /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
+ "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
+ /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
+ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
+ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
+ /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
+ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
+ /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
+ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
+ /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
+ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
+
+ /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
+ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
+ /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
+ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
+ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
+ /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
+ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
+ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
+ /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
+ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
+ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
+ /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
+ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "daddiu %[src], %[src], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
+ [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
+ [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
+ [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
+ [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp)
+ : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a),
+ [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride)
+ : "memory");
+}
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
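The new rotate_mmi.cc implements the same 8x8 tile transpose as the SSSE3 and NEON paths, using Loongson MMI's 64-bit punpck interleaves: three rounds of byte, halfword, and word unpacks turn eight 8-byte rows into eight transposed columns, with the intermediate lane layouts spelled out in the inline comments. A scalar reference for what one TransposeWx8 iteration produces (a sketch, not the upstream C fallback verbatim):

    #include <stdint.h>

    // For each of `width` source columns, write an 8-byte destination row
    // taken from the 8 source rows: dst[x][y] = src[y][x].
    static void TransposeWx8_Sketch(const uint8_t* src, int src_stride,
                                    uint8_t* dst, int dst_stride, int width) {
      for (int x = 0; x < width; ++x) {
        for (int y = 0; y < 8; ++y) {
          dst[x * dst_stride + y] = src[y * src_stride + x];
        }
      }
    }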
diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc
index fdc0dd476c6..844df2bf305 100644
--- a/chromium/third_party/libyuv/source/rotate_neon.cc
+++ b/chromium/third_party/libyuv/source/rotate_neon.cc
@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld1.8 {d0}, [%0], %2 \n"
- "vld1.8 {d1}, [%0], %2 \n"
- "vld1.8 {d2}, [%0], %2 \n"
- "vld1.8 {d3}, [%0], %2 \n"
- "vld1.8 {d4}, [%0], %2 \n"
- "vld1.8 {d5}, [%0], %2 \n"
- "vld1.8 {d6}, [%0], %2 \n"
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d1}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d3}, [%0], %4 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d5}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d7}, [%0], %4 \n"
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld2.8 {d0, d1}, [%0], %2 \n"
- "vld2.8 {d2, d3}, [%0], %2 \n"
- "vld2.8 {d4, d5}, [%0], %2 \n"
- "vld2.8 {d6, d7}, [%0], %2 \n"
- "vld2.8 {d16, d17}, [%0], %2 \n"
- "vld2.8 {d18, d19}, [%0], %2 \n"
- "vld2.8 {d20, d21}, [%0], %2 \n"
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d6}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d18}, [%0], %4 \n"
- "vst1.8 {d16}, [%0], %4 \n"
- "vst1.8 {d22}, [%0], %4 \n"
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- "vst1.8 {d3}, [%0], %6 \n"
- "vst1.8 {d1}, [%0], %6 \n"
- "vst1.8 {d7}, [%0], %6 \n"
- "vst1.8 {d5}, [%0], %6 \n"
- "vst1.8 {d19}, [%0], %6 \n"
- "vst1.8 {d17}, [%0], %6 \n"
- "vst1.8 {d23}, [%0], %6 \n"
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
diff --git a/chromium/third_party/libyuv/source/rotate_neon64.cc b/chromium/third_party/libyuv/source/rotate_neon64.cc
index f469baacf68..43c1581731d 100644
--- a/chromium/third_party/libyuv/source/rotate_neon64.cc
+++ b/chromium/third_party/libyuv/source/rotate_neon64.cc
@@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
- "sub %w3, %w3, #8 \n"
+ "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
- "ld1 {v0.8b}, [%0], %5 \n"
- "ld1 {v1.8b}, [%0], %5 \n"
- "ld1 {v2.8b}, [%0], %5 \n"
- "ld1 {v3.8b}, [%0], %5 \n"
- "ld1 {v4.8b}, [%0], %5 \n"
- "ld1 {v5.8b}, [%0], %5 \n"
- "ld1 {v6.8b}, [%0], %5 \n"
- "ld1 {v7.8b}, [%0] \n"
-
- "trn2 v16.8b, v0.8b, v1.8b \n"
- "trn1 v17.8b, v0.8b, v1.8b \n"
- "trn2 v18.8b, v2.8b, v3.8b \n"
- "trn1 v19.8b, v2.8b, v3.8b \n"
- "trn2 v20.8b, v4.8b, v5.8b \n"
- "trn1 v21.8b, v4.8b, v5.8b \n"
- "trn2 v22.8b, v6.8b, v7.8b \n"
- "trn1 v23.8b, v6.8b, v7.8b \n"
-
- "trn2 v3.4h, v17.4h, v19.4h \n"
- "trn1 v1.4h, v17.4h, v19.4h \n"
- "trn2 v2.4h, v16.4h, v18.4h \n"
- "trn1 v0.4h, v16.4h, v18.4h \n"
- "trn2 v7.4h, v21.4h, v23.4h \n"
- "trn1 v5.4h, v21.4h, v23.4h \n"
- "trn2 v6.4h, v20.4h, v22.4h \n"
- "trn1 v4.4h, v20.4h, v22.4h \n"
-
- "trn2 v21.2s, v1.2s, v5.2s \n"
- "trn1 v17.2s, v1.2s, v5.2s \n"
- "trn2 v20.2s, v0.2s, v4.2s \n"
- "trn1 v16.2s, v0.2s, v4.2s \n"
- "trn2 v23.2s, v3.2s, v7.2s \n"
- "trn1 v19.2s, v3.2s, v7.2s \n"
- "trn2 v22.2s, v2.2s, v6.2s \n"
- "trn1 v18.2s, v2.2s, v6.2s \n"
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
- "st1 {v17.8b}, [%0], %6 \n"
- "st1 {v16.8b}, [%0], %6 \n"
- "st1 {v19.8b}, [%0], %6 \n"
- "st1 {v18.8b}, [%0], %6 \n"
- "st1 {v21.8b}, [%0], %6 \n"
- "st1 {v20.8b}, [%0], %6 \n"
- "st1 {v23.8b}, [%0], %6 \n"
- "st1 {v22.8b}, [%0] \n"
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w3, %w3, #8 \n"
- "b.eq 4f \n"
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w3, #2 \n"
- "b.lt 3f \n"
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %w3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
// 4x8 block
- "mov %0, %1 \n"
- "ld1 {v0.s}[0], [%0], %5 \n"
- "ld1 {v0.s}[1], [%0], %5 \n"
- "ld1 {v0.s}[2], [%0], %5 \n"
- "ld1 {v0.s}[3], [%0], %5 \n"
- "ld1 {v1.s}[0], [%0], %5 \n"
- "ld1 {v1.s}[1], [%0], %5 \n"
- "ld1 {v1.s}[2], [%0], %5 \n"
- "ld1 {v1.s}[3], [%0] \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
- "sub %w4, %w4, #8 \n"
+ "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "ld1 {v0.16b}, [%0], %5 \n"
- "ld1 {v1.16b}, [%0], %5 \n"
- "ld1 {v2.16b}, [%0], %5 \n"
- "ld1 {v3.16b}, [%0], %5 \n"
- "ld1 {v4.16b}, [%0], %5 \n"
- "ld1 {v5.16b}, [%0], %5 \n"
- "ld1 {v6.16b}, [%0], %5 \n"
- "ld1 {v7.16b}, [%0] \n"
-
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
-
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
-
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "mov %0, %1 \n"
- "mov %0, %2 \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
- "st1 {v16.d}[0], [%0], %6 \n"
- "st1 {v18.d}[0], [%0], %6 \n"
- "st1 {v17.d}[0], [%0], %6 \n"
- "st1 {v19.d}[0], [%0], %6 \n"
- "st1 {v16.d}[1], [%0], %6 \n"
- "st1 {v18.d}[1], [%0], %6 \n"
- "st1 {v17.d}[1], [%0], %6 \n"
- "st1 {v19.d}[1], [%0] \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %3 \n"
+ "mov %0, %2 \n"
- "st1 {v20.d}[0], [%0], %7 \n"
- "st1 {v22.d}[0], [%0], %7 \n"
- "st1 {v21.d}[0], [%0], %7 \n"
- "st1 {v23.d}[0], [%0], %7 \n"
- "st1 {v20.d}[1], [%0], %7 \n"
- "st1 {v22.d}[1], [%0], %7 \n"
- "st1 {v21.d}[1], [%0], %7 \n"
- "st1 {v23.d}[1], [%0] \n"
-
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
- "subs %w4, %w4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w4, %w4, #8 \n"
- "b.eq 4f \n"
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w4, #2 \n"
- "b.lt 3f \n"
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %w4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
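The aarch64 TransposeWx8 change is not only reformatting: it interleaves a prfm pldl1keep for each source row between the trn1/trn2 shuffles, so the prefetches co-issue with the transpose math instead of bunching up ahead of the loads (448 bytes is seven 64-byte cache lines, matching the "prefetch 7 lines ahead" comment). A rough C analogue using the GCC/Clang builtin — an approximation, since the real benefit comes from hand-placing the prefetch between specific instructions:

    #include <stdint.h>

    // Issue one L1 read prefetch per row of the next tile, ~448 bytes ahead.
    static void PrefetchTileAhead(const uint8_t* src, int src_stride) {
      for (int y = 0; y < 8; ++y) {
        __builtin_prefetch(src + (size_t)y * src_stride + 448, /*rw=*/0,
                           /*locality=*/3);
      }
    }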
diff --git a/chromium/third_party/libyuv/source/row_any.cc b/chromium/third_party/libyuv/source/row_any.cc
index e91560c44c6..7216373bcd1 100644
--- a/chromium/third_party/libyuv/source/row_any.cc
+++ b/chromium/third_party/libyuv/source/row_any.cc
@@ -64,6 +64,9 @@ ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#ifdef HAS_I422ALPHATOARGBROW_MSA
ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
#endif
+#ifdef HAS_I422ALPHATOARGBROW_MMI
+ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
+#endif
#undef ANY41C
// Any 3 planes to 1.
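Each block in this file registers an "Any" wrapper for a SIMD kernel: the trailing macro argument is a width mask, so a kernel registered with mask 7 handles widths in multiples of 8 and the wrapper covers the ragged tail. In rough shape — a sketch of the pattern, not the macros' exact expansion, with SomeRow_SIMD as a hypothetical kernel — a wrapper runs the kernel on the aligned prefix and then once more on a padded stack copy of the remainder:

    #include <stdint.h>
    #include <string.h>

    void SomeRow_SIMD(const uint8_t* src, uint8_t* dst, int width);  // hypothetical

    void SomeRow_Any(const uint8_t* src, uint8_t* dst, int width) {
      enum { kMask = 7 };        // kernel handles multiples of 8 pixels
      int n = width & ~kMask;    // aligned prefix
      if (n > 0) {
        SomeRow_SIMD(src, dst, n);
      }
      int r = width & kMask;
      if (r > 0) {               // ragged tail via a padded temp
        uint8_t tmp_src[kMask + 1] = {0};
        uint8_t tmp_dst[kMask + 1];
        memcpy(tmp_src, src + n, r);
        SomeRow_SIMD(tmp_src, tmp_dst, kMask + 1);
        memcpy(dst + n, tmp_dst, r);
      }
    }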
@@ -92,6 +95,9 @@ ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
#ifdef HAS_MERGERGBROW_NEON
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#endif
+#ifdef HAS_MERGERGBROW_MMI
+ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
+#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -106,18 +112,27 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#ifdef HAS_I422TOYUY2ROW_MSA
ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
#endif
+#ifdef HAS_I422TOYUY2ROW_MMI
+ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
+#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOUYVYROW_MSA
ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
#endif
+#ifdef HAS_I422TOUYVYROW_MMI
+ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
+#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
+#ifdef HAS_BLENDPLANEROW_MMI
+ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
+#endif
#undef ANY31
// Note that odd width replication includes 444 due to implementation
@@ -203,6 +218,15 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#endif
+#ifdef HAS_I422TOARGBROW_MMI
+ANY31C(I444ToARGBRow_Any_MMI, I444ToARGBRow_MMI, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MMI, I422ToARGBRow_MMI, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MMI, I422ToRGB24Row_MMI, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MMI, I422ToARGB4444Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
+#endif
#undef ANY31C
// Any 3 planes of 16 bit to 1 with yuvconstants
@@ -238,6 +262,9 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
#ifdef HAS_I210TOAR30ROW_AVX2
ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
+#ifdef HAS_I210TOARGBROW_MMI
+ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7)
+#endif
#undef ANY31CT
// Any 2 planes to 1.
@@ -271,7 +298,15 @@ ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
#ifdef HAS_MERGEUVROW_MSA
ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
#endif
-
+#ifdef HAS_MERGEUVROW_MMI
+ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
// Math functions.
#ifdef HAS_ARGBMULTIPLYROW_SSE2
ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
@@ -303,12 +338,21 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#ifdef HAS_ARGBMULTIPLYROW_MSA
ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#endif
+#ifdef HAS_ARGBMULTIPLYROW_MMI
+ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
+#ifdef HAS_ARGBADDROW_MMI
+ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
+#ifdef HAS_ARGBSUBTRACTROW_MMI
+ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
@@ -318,6 +362,9 @@ ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
#ifdef HAS_SOBELROW_MSA
ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
#endif
+#ifdef HAS_SOBELROW_MMI
+ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
+#endif
#ifdef HAS_SOBELTOPLANEROW_SSE2
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
#endif
@@ -327,6 +374,9 @@ ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
#ifdef HAS_SOBELTOPLANEROW_MSA
ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
#endif
+#ifdef HAS_SOBELTOPLANEROW_MMI
+ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
+#endif
#ifdef HAS_SOBELXYROW_SSE2
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
#endif
@@ -336,6 +386,9 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#ifdef HAS_SOBELXYROW_MSA
ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
#endif
+#ifdef HAS_SOBELXYROW_MMI
+ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
+#endif
#undef ANY21
// Any 2 planes to 1 with yuvconstants
@@ -369,6 +422,9 @@ ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#ifdef HAS_NV12TOARGBROW_MSA
ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV12TOARGBROW_MMI
+ANY21C(NV12ToARGBRow_Any_MMI, NV12ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
@@ -381,6 +437,9 @@ ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#ifdef HAS_NV21TOARGBROW_MSA
ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV21TOARGBROW_MMI
+ANY21C(NV21ToARGBRow_Any_MMI, NV21ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
#ifdef HAS_NV12TORGB24ROW_NEON
ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
#endif
@@ -390,6 +449,9 @@ ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
#ifdef HAS_NV12TORGB24ROW_SSSE3
ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
+#ifdef HAS_NV12TORGB24ROW_MMI
+ANY21C(NV12ToRGB24Row_Any_MMI, NV12ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
#ifdef HAS_NV21TORGB24ROW_SSSE3
ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
@@ -399,6 +461,9 @@ ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
#ifdef HAS_NV21TORGB24ROW_AVX2
ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
#endif
+#ifdef HAS_NV21TORGB24ROW_MMI
+ANY21C(NV21ToRGB24Row_Any_MMI, NV21ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
@@ -411,6 +476,9 @@ ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#ifdef HAS_NV12TORGB565ROW_MSA
ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
#endif
+#ifdef HAS_NV12TORGB565ROW_MMI
+ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7)
+#endif
#undef ANY21C
// Any 1 to 1.
@@ -478,12 +546,6 @@ ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
#if defined(HAS_J400TOARGBROW_AVX2)
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
@@ -491,6 +553,9 @@ ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
+#endif
#if defined(HAS_RAWTORGB24ROW_SSSE3)
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
#endif
@@ -510,7 +575,6 @@ ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
#endif
#if defined(HAS_ARGBTORGB24ROW_MSA)
ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
@@ -519,7 +583,14 @@ ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
-ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
+ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
@@ -527,12 +598,21 @@ ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#if defined(HAS_RAWTORGB24ROW_MSA)
ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
+#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
+#ifdef HAS_ABGRTOYROW_AVX2
+ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
+#endif
#ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif
+#ifdef HAS_RGBATOYJROW_AVX2
+ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
+#endif
#ifdef HAS_UYVYTOYROW_AVX2
ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
#endif
@@ -552,63 +632,117 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
#ifdef HAS_ARGBTOYJROW_SSSE3
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#endif
+#ifdef HAS_RGBATOYJROW_SSSE3
+ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
#ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYROW_MSA
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBTOYROW_MMI
+ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
+#ifdef HAS_RGBATOYJROW_NEON
+ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7)
+#endif
#ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBTOYJROW_MMI
+ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_MSA
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_BGRATOYROW_MMI
+ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_MSA
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
#endif
+#ifdef HAS_ABGRTOYROW_MMI
+ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_MSA
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_RGBATOYROW_MMI
+ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7)
+#endif
#ifdef HAS_RGB24TOYROW_MSA
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
#endif
+#ifdef HAS_RGB24TOYROW_MMI
+ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
+#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7)
+#endif
#ifdef HAS_RAWTOYROW_MSA
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
#endif
+#ifdef HAS_RAWTOYROW_MMI
+ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
+#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_RGB565TOYROW_MSA
ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
#endif
+#ifdef HAS_RGB565TOYROW_MMI
+ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_ARGB1555TOYROW_NEON
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB1555TOYROW_MSA
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
#endif
+#ifdef HAS_ARGB1555TOYROW_MMI
+ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_ARGB4444TOYROW_NEON
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
#endif
+#ifdef HAS_ARGB4444TOYROW_MMI
+ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_YUY2TOYROW_NEON
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#endif
@@ -618,39 +752,75 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
#ifdef HAS_YUY2TOYROW_MSA
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_YUY2TOYROW_MMI
+ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
+#endif
#ifdef HAS_UYVYTOYROW_MSA
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_UYVYTOYROW_MMI
+ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
+#ifdef HAS_SWAPUVROW_NEON
+ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
+#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RGB24TOARGBROW_MSA
ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
#endif
+#ifdef HAS_RGB24TOARGBROW_MMI
+ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
+#endif
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
+#ifdef HAS_RAWTORGBAROW_NEON
+ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
+#endif
#ifdef HAS_RAWTOARGBROW_MSA
ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
#endif
+#ifdef HAS_RAWTOARGBROW_MMI
+ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
+#endif
#ifdef HAS_RGB565TOARGBROW_NEON
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_MSA
ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_RGB565TOARGBROW_MMI
+ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_MSA
ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_ARGB1555TOARGBROW_MMI
+ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_MSA
ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_ARGB4444TOARGBROW_MMI
+ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
@@ -669,6 +839,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_MSA
ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
#endif
+#ifdef HAS_ARGBATTENUATEROW_MMI
+ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
+#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
@@ -681,6 +854,9 @@ ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
+ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
+#endif
#undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
@@ -705,12 +881,18 @@ ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
+#ifdef HAS_ARGBCOPYALPHAROW_MMI
+ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
+#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
+ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
+#endif
#undef ANY11B
// Any 1 to 1 with parameter.
@@ -728,6 +910,47 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+ I400ToARGBRow_SSE2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+ I400ToARGBRow_AVX2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+ I400ToARGBRow_NEON,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+ I400ToARGBRow_MSA,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ANY11P(I400ToARGBRow_Any_MMI,
+ I400ToARGBRow_MMI,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
ARGBToRGB565DitherRow_SSE2,
@@ -760,6 +983,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA,
2,
7)
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ANY11P(ARGBToRGB565DitherRow_Any_MMI,
+ ARGBToRGB565DitherRow_MMI,
+ const uint32_t,
+ 4,
+ 2,
+ 3)
+#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
#endif
@@ -772,6 +1003,10 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
#ifdef HAS_ARGBSHUFFLEROW_MSA
ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
#endif
+#ifdef HAS_ARGBSHUFFLEROW_MMI
+ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
+#endif
+#undef ANY11P
#undef ANY11P
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
@@ -909,6 +1144,10 @@ ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
#endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+ANY11C(YUY2ToARGBRow_Any_MMI, YUY2ToARGBRow_MMI, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7)
+#endif
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
@@ -940,6 +1179,9 @@ ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#ifdef HAS_INTERPOLATEROW_MSA
ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
#endif
+#ifdef HAS_INTERPOLATEROW_MMI
+ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
+#endif
#undef ANY11T
// Any 1 to 1 mirror.
@@ -964,11 +1206,26 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
#endif
#ifdef HAS_MIRRORROW_MSA
ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#endif
+#ifdef HAS_MIRRORROW_MMI
+ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
+#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -976,17 +1233,27 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
#endif
+#ifdef HAS_ARGBMIRRORROW_MMI
+ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
+#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
+#endif
#undef ANY11M
// Any 1 plane. (memset)
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
SIMD_ALIGNED(uint8_t temp[64]); \
+ memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
@@ -1008,6 +1275,9 @@ ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
#ifdef HAS_ARGBSETROW_MSA
ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
#endif
+#ifdef HAS_ARGBSETROW_MMI
+ANY1(ARGBSetRow_Any_MMI, ARGBSetRow_MMI, uint32_t, 4, 3)
+#endif
#undef ANY1
// Any 1 to 2. Outputs UV planes.
@@ -1039,6 +1309,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#ifdef HAS_SPLITUVROW_MSA
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
#endif
+#ifdef HAS_SPLITUVROW_MMI
+ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
+#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
@@ -1060,6 +1333,11 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_YUY2TOUV422ROW_MMI
+ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
+ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
+ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
+#endif
#undef ANY12
// Any 1 to 3. Outputs RGB planes.
@@ -1086,6 +1364,9 @@ ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif
+#ifdef HAS_SPLITRGBROW_MMI
+ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
+#endif
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
@@ -1116,6 +1397,9 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif
+#ifdef HAS_ABGRTOUVROW_AVX2
+ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
+#endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
@@ -1140,29 +1424,44 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
#ifdef HAS_ARGBTOUVROW_MSA
ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_ARGBTOUVROW_MMI
+ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_ARGBTOUVJROW_MMI
+ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MSA
-ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MMI
+ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
-ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MMI
+ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MSA
-ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MMI
+ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
@@ -1170,27 +1469,42 @@ ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#ifdef HAS_RGB24TOUVROW_MSA
ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
#endif
+#ifdef HAS_RGB24TOUVROW_MMI
+ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
+#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_MSA
ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
#endif
+#ifdef HAS_RAWTOUVROW_MMI
+ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
+#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_RGB565TOUVROW_MSA
ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
#endif
+#ifdef HAS_RGB565TOUVROW_MMI
+ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_MSA
ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
#endif
+#ifdef HAS_ARGB1555TOUVROW_MMI
+ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
+#ifdef HAS_ARGB4444TOUVROW_MMI
+ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_YUY2TOUVROW_NEON
ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#endif
@@ -1200,11 +1514,48 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#ifdef HAS_YUY2TOUVROW_MSA
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#endif
+#ifdef HAS_YUY2TOUVROW_MMI
+ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
+#endif
#ifdef HAS_UYVYTOUVROW_MSA
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
#endif
+#ifdef HAS_UYVYTOUVROW_MMI
+ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
+#endif
#undef ANY12S
+// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, MASK + 1); \
+ memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \
+ }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
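The `(width & 1) && UVSHIFT == 0` branch in ANY11S duplicates the last pixel in both scratch rows so a kernel that averages horizontal pairs (UVSHIFT == 0, as in the AYUV routines registered here) never reads past the copied tail. A tiny standalone illustration of that duplication step; the function name and 4-byte pixel size are illustrative.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* After copying 'valid' pixels of 'bpp' bytes into 'row', append a copy of
 * the last pixel so pair-averaging over valid+1 pixels is well defined. */
static void RepeatLastPixel(uint8_t* row, int valid, int bpp) {
  memcpy(row + valid * bpp, row + (valid - 1) * bpp, (size_t)bpp);
}

int main(void) {
  uint8_t row[16] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
  RepeatLastPixel(row, 3, 4);                /* 3 valid AYUV-sized pixels */
  assert(memcmp(row + 12, row + 8, 4) == 0); /* pixel 3 mirrors pixel 2 */
  return 0;
}
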
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc
index 2bbc5adbf14..79aed5c7877 100644
--- a/chromium/third_party/libyuv/source/row_common.cc
+++ b/chromium/third_party/libyuv/source/row_common.cc
@@ -14,30 +14,44 @@
#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h" // For kYuvI601Constants
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// The following ifdef from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+#define LIBYUV_RGB7 1
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86)
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#endif
+
// llvm x86 is poor at ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
static __inline int32_t clamp0(int32_t v) {
- return ((-(v) >> 31) & (v));
+ return -(v >= 0) & v;
}
-
+// TODO(fbarchard): make clamp255 preserve negative values.
static __inline int32_t clamp255(int32_t v) {
- return (((255 - (v)) >> 31) | (v)) & 255;
+ return (-(v >= 255) | v) & 255;
}
static __inline int32_t clamp1023(int32_t v) {
- return (((1023 - (v)) >> 31) | (v)) & 1023;
+ return (-(v >= 1023) | v) & 1023;
}
static __inline uint32_t Abs(int32_t v) {
- int m = v >> 31;
+ int m = -(v < 0);
return (v + m) ^ m;
}
#else // USE_BRANCHLESS
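
The rewritten helpers replace the old shift-derived masks with comparison-derived ones: `-(cond)` is all-ones when the condition holds and zero otherwise, so it selects a value without a branch. A standalone check of the identities, including the caveat flagged in the TODO above:

#include <assert.h>
#include <stdint.h>

static int32_t clamp0(int32_t v)   { return -(v >= 0) & v; }
static int32_t clamp255(int32_t v) { return (-(v >= 255) | v) & 255; }
static uint32_t Abs(int32_t v)     { int m = -(v < 0); return (v + m) ^ m; }

int main(void) {
  assert(clamp0(-7) == 0 && clamp0(7) == 7);
  assert(clamp255(300) == 255 && clamp255(200) == 200);
  assert(clamp255(-1) == 255); /* the TODO: negatives are not preserved */
  assert(Abs(-5) == 5 && Abs(5) == 5);
  return 0;
}
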
@@ -111,6 +125,21 @@ void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
}
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t r = src_raw[0];
+ uint8_t g = src_raw[1];
+ uint8_t b = src_raw[2];
+ dst_rgba[0] = 255u;
+ dst_rgba[1] = b;
+ dst_rgba[2] = g;
+ dst_rgba[3] = r;
+ dst_rgba += 4;
+ src_raw += 3;
+ }
+}
+
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -181,7 +210,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -195,7 +225,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -209,7 +240,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = ar30 & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
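
All three AR30 loops above swap `*(const uint32_t*)src_ar30` for a `memcpy` into a local. The cast form is undefined behavior when the pointer is not 4-byte aligned (and shaky under strict aliasing); a fixed-size memcpy is defined for any alignment, and current compilers fold it to the same single load. A sketch of the idiom:

#include <stdint.h>
#include <string.h>

/* Well-defined unaligned 32-bit load; optimizers emit a single mov. */
static uint32_t LoadU32(const uint8_t* p) {
  uint32_t v;
  memcpy(&v, p, sizeof v);
  return v;
}

int main(void) {
  const uint8_t ar30[4] = {0xff, 0x03, 0x00, 0x00}; /* 0x000003ff, little-endian */
  return (LoadU32(ar30) & 0x3ff) == 1023 ? 0 : 1;   /* exits 0 on success */
}
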
@@ -381,18 +413,55 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
}
}
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+ return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
+}
+#else
+// 8 bit
+// Intel SSE/AVX uses the following equivalent formula
+// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round.
+// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
+// 0x7e80) >> 8;
+
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
+#endif
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#ifdef LIBYUV_RGBTOU_TRUNCATE
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * b - 74 * g - 38 * r + 0x8000) >> 8;
+}
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * r - 94 * g - 18 * b + 0x8000) >> 8;
+}
+#else
+// TODO(fbarchard): Add rounding to SIMD and use this
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
+#endif
+
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8;
+}
+#endif
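+
With the 8-bit coefficients, the studio-swing anchors fall out exactly: black maps to 16, white to 235, and a neutral gray leaves the truncating U formula unbiased at 128. A standalone check of the constants above:

#include <assert.h>
#include <stdint.h>

/* The 8-bit fixed-point luma and truncating chroma from this hunk. */
static int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
static int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  return (112 * b - 74 * g - 38 * r + 0x8000) >> 8;
}

int main(void) {
  assert(RGBToY(0, 0, 0) == 16);        /* video black */
  assert(RGBToY(255, 255, 255) == 235); /* video white */
  assert(RGBToU(128, 128, 128) == 128); /* neutral gray -> unbiased chroma */
  return 0;
}
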
// ARGBToY_C and ARGBToUV_C
+// Intel version mimics SSE/AVX, which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
+
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@@ -407,15 +476,12 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
- src_rgb1[B + BPP]) >> \
- 2; \
- uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
- src_rgb1[G + BPP]) >> \
- 2; \
- uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
- src_rgb1[R + BPP]) >> \
- 2; \
+ uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
@@ -424,13 +490,54 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
dst_v += 1; \
} \
if (width & 1) { \
- uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
+#else
+// ARM version does sum / 2, then multiplies by 2x smaller coefficients
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = src_rgb0[B] + src_rgb1[B]; \
+ uint16_t ag = src_rgb0[G] + src_rgb1[G]; \
+ uint16_t ar = src_rgb0[R] + src_rgb1[R]; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ } \
+ }
+#endif
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
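
The box filter now averages the two rows first, then the two columns, each step with AVGB, matching the rounding of the x86 pavgb instruction. That cascade is not always the exact rounded mean of the four samples; a tiny check exhibiting a one-off case:

#include <assert.h>

#define AVGB(a, b) (((a) + (b) + 1) >> 1) /* rounds up, like pavgb */

int main(void) {
  int p00 = 0, p01 = 0; /* top row: left, right */
  int p10 = 0, p11 = 1; /* bottom row: left, right */
  int cascade = AVGB(AVGB(p00, p10), AVGB(p01, p11)); /* rows, then columns */
  int exact = (p00 + p01 + p10 + p11 + 2) >> 2;       /* round-to-nearest */
  assert(cascade == 1 && exact == 0); /* the cascade rounds up one extra */
  return 0;
}
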
@@ -448,14 +555,14 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// b 0.1016 * 255 = 25.908 = 25
// g 0.5078 * 255 = 129.489 = 129
// r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
+// JPeg 7 bit Y (deprecated)
// b 0.11400 * 128 = 14.592 = 15
// g 0.58700 * 128 = 75.136 = 75
// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit Y:
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
// JPeg 8 bit U:
// b 0.50000 * 255 = 127.5 = 127
// g -0.33126 * 255 = -84.4713 = -84
@@ -465,20 +572,37 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (38 * r + 75 * g + 15 * b + 64) >> 7;
}
+#else
+// 8 bit
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+ return (77 * r + 150 * g + 29 * b + 128) >> 8;
+}
+#endif
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
}
static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#else
+static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+}
+#endif
// ARGBToYJ_C and ARGBToUVJ_C
+// Intel version mimics SSE/AVX, which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@@ -514,8 +638,53 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
dst_v[0] = RGBToVJ(ar, ag, ab); \
} \
}
+#else
+// ARM version does sum / 2, then multiplies by 2x smaller coefficients
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
+ uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
+ uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ } \
+ }
+
+#endif
MAKEROWYJ(ARGB, 2, 1, 0, 4)
+MAKEROWYJ(RGBA, 3, 2, 1, 4)
+MAKEROWYJ(RGB24, 2, 1, 0, 3)
+MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
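
The JPEG-range luma coefficients sum to 256 (77 + 150 + 29), so RGBToYJ is an identity on gray levels: full-range black stays 0 and white stays 255, in contrast to the studio-range anchors checked earlier. A quick standalone verification:

#include <assert.h>
#include <stdint.h>

/* 8-bit JPEG luma from this hunk; 77 + 150 + 29 == 256 keeps grays fixed. */
static int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (77 * r + 150 * g + 29 * b + 128) >> 8;
}

int main(void) {
  assert(RGBToYJ(0, 0, 0) == 0);
  assert(RGBToYJ(255, 255, 255) == 255);
  assert(RGBToYJ(128, 128, 128) == 128);
  return 0;
}
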
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -583,13 +752,34 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b3 = next_rgb565[2] & 0x1f;
uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
uint8_t r3 = next_rgb565[3] >> 3;
- uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 787 -> 888.
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 2) | (g1 >> 4);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 2) | (g3 >> 4);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
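
Rather than summing packed 5/6-bit channels and widening afterwards, the loop now expands every channel to 8 bits up front by replicating its high bits into the low ones, which maps each channel's maximum to exactly 255 where a plain shift would not. A standalone check of the expansion:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint8_t b5 = 31, g6 = 63, m5 = 16;
  assert(((b5 << 3) | (b5 >> 2)) == 255); /* 5-bit max -> 255 exactly */
  assert(((g6 << 2) | (g6 >> 4)) == 255); /* 6-bit max -> 255 exactly */
  assert(((m5 << 3) | (m5 >> 2)) == 132); /* mid-range 5-bit example */
  return 0;
}
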
@@ -602,14 +792,27 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b2 = next_rgb565[0] & 0x1f;
uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b = (b0 + b2); // 565 * 2 = 676.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 676 -> 888
- g = (g << 1) | (g >> 6);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -633,14 +836,34 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t b3 = next_argb1555[2] & 0x1f;
uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 777 -> 888.
- g = (g << 1) | (g >> 6);
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 3) | (g1 >> 2);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 3) | (g3 >> 2);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
@@ -653,14 +876,27 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t b2 = next_argb1555[0] & 0x1f;
uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
uint8_t r2 = next_argb1555[1] >> 3;
- uint8_t b = (b0 + b2); // 555 * 2 = 666.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -684,14 +920,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b3 = next_argb4444[2] & 0x0f;
uint8_t g3 = next_argb4444[2] >> 4;
uint8_t r3 = next_argb4444[3] & 0x0f;
- uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b1 = (b1 << 4) | b1;
+ g1 = (g1 << 4) | g1;
+ r1 = (r1 << 4) | r1;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+ b3 = (b3 << 4) | b3;
+ g3 = (g3 << 4) | g3;
+ r3 = (r3 << 4) | r3;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
@@ -704,14 +960,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b2 = next_argb4444[0] & 0x0f;
uint8_t g2 = next_argb4444[0] >> 4;
uint8_t r2 = next_argb4444[1] & 0x0f;
- uint8_t b = (b0 + b2); // 444 * 2 = 555.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 3) | (b >> 2); // 555 -> 888.
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -1087,26 +1356,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__) // 32 bit arm
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1118,7 +1387,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1129,7 +1400,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1168,26 +1441,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1199,7 +1472,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1210,7 +1485,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1251,26 +1528,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1282,7 +1559,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1293,7 +1572,95 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.2020 YUV to RGB reference
+// R = (Y - 16) * 1.164384 - V * -1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 - U * -2.14177
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+
+// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.142 * 64)) */
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR -107 /* round(-1.67867 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
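These fixed-point values follow from the reference coefficients above at 6-bit precision (scale 64), with YG carrying the extra 256 * 256 / 257 factor consumed by the `0x0101 * YG` trick. A standalone recomputation; note that UB saturates at the -128 limit of the signed 8-bit SIMD lanes:

#include <assert.h>
#include <math.h>

int main(void) {
  assert(lround(1.164384 * 64 * 256 * 256 / 257) == 19003); /* YG */
  assert(lround(0.187326 * 64) == 12);                      /* UG */
  assert(lround(0.65042 * 64) == 42);                       /* VG */
  assert(lround(-1.67867 * 64) == -107);                    /* VR */
  assert(lround(-2.14177 * 64) == -137);                    /* UB pre-clamp */
  /* UB is then clamped: max(-128, -137) == -128 to fit int8 lanes. */
  return 0;
}
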
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1308,7 +1675,6 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 16 bit.
-
static __inline void YuvPixel(uint8_t y,
uint8_t u,
uint8_t v,
@@ -1324,7 +1690,7 @@ static __inline void YuvPixel(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1333,7 +1699,7 @@ static __inline void YuvPixel(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1367,7 +1733,7 @@ static __inline void YuvPixel8_16(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1376,7 +1742,7 @@ static __inline void YuvPixel8_16(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1411,7 +1777,7 @@ static __inline void YuvPixel16(int16_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1420,7 +1786,7 @@ static __inline void YuvPixel16(int16_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1458,21 +1824,26 @@ static __inline void YuvPixel10(uint16_t y,
*r = Clamp(r16 >> 6);
}
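The kYToRgb[1] reads in the hunks above line up with the new ARM constant layout shown earlier in this patch, where each block ends in {0x0101 * YG, YG, 0, 0}. A minimal sketch of the equivalence (illustrative, not patch text):

  // Element 0 holds the byte-replicated multiplier (0x0101 * YG) used by
  // the assembly; element 1 now carries plain YG for the C reference code.
  int yg_old = yuvconstants->kYToRgb[0] / 0x0101;  // unpack, pre-patch
  int yg_new = yuvconstants->kYToRgb[1];           // direct read, post-patch
  // yg_old == yg_new for any YG representable in the packed form.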
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
- uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32_t)(y1 + YGB) >> 6);
- *g = Clamp((int32_t)(y1 + YGB) >> 6);
- *r = Clamp((int32_t)(y1 + YGB) >> 6);
+// Reads 8 bit Y and writes 8 bit B, G and R.
+static __inline void YPixel(uint8_t y,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ *b = Clamp(((int32_t)(y1) + ygb) >> 6);
+ *g = Clamp(((int32_t)(y1) + ygb) >> 6);
+ *r = Clamp(((int32_t)(y1) + ygb) >> 6);
}
-#undef YG
-#undef YGB
-
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C code that mimics the assembly.
@@ -2006,18 +2377,21 @@ void I422ToRGBARow_C(const uint8_t* src_y,
}
}
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
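I400ToARGBRow_C now takes the same yuvconstants parameter as the other row functions, so grey expansion honors the selected colorspace. A minimal usage sketch (the helper name and buffers are hypothetical; kYuvI601Constants is the BT.601 table declared in row.h):

  extern const struct YuvConstants kYuvI601Constants;

  // Expand one row of 8-bit luma into opaque ARGB (B,G,R,A in memory).
  static void ExpandGreyRow(const uint8_t* src_y, uint8_t* dst_argb,
                            int width) {
    I400ToARGBRow_C(src_y, dst_argb, &kYuvI601Constants, width);
  }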
@@ -2035,10 +2409,21 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0];
+ dst_uv[1] = src_uv[1];
+ src_uv -= 2;
+ dst_uv += 2;
+ }
+}
+
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width - 1; x += 2) {
@@ -2069,6 +2454,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
+ int x;
+ src_rgb24 += width * 3 - 3;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_rgb24[0];
+ uint8_t g = src_rgb24[1];
+ uint8_t r = src_rgb24[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ src_rgb24 -= 3;
+ dst_rgb24 += 3;
+ }
+}
+
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2208,10 +2608,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
}
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
- uint32_t* d = (uint32_t*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
- d[x] = v32;
+ memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32);
}
}
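The ARGBSetRow_C rewrite above trades a store through a casted uint32_t pointer for memcpy. Writing through (uint32_t*)dst_argb is undefined behavior when the destination is not 4-byte aligned (and trips sanitizers), while a fixed-size memcpy is well defined and compiles to the same single 32-bit store. A standalone sketch of the idiom (hypothetical helper name):

  #include <stdint.h>
  #include <string.h>

  // Well-defined 32-bit store to a possibly unaligned address.
  static inline void Store32(uint8_t* dst, uint32_t v) {
    memcpy(dst, &v, sizeof v);  // optimizers lower this to one mov/str
  }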
@@ -2309,7 +2708,7 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
}
}
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
@@ -2385,10 +2784,14 @@ void BlendPlaneRow_C(const uint8_t* src0,
}
#undef UBLEND
+#if defined(__aarch64__) || defined(__arm__)
+#define ATTENUATE(f, a) (f * a + 128) >> 8
+#else
+// This code mimics the SSSE3 version for better testability.
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#endif
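The two ATTENUATE variants round differently: the SSSE3-mimicking form is effectively (f * a) / 255 via 16-bit replication, while the ARM form is (f * a + 128) / 256, matching the NEON kernels. A small check of the divergence (editorial sketch; values assumed 0..255):

  #include <stdint.h>

  static uint8_t AttenuateSSSE3(uint32_t f, uint32_t a) {
    return (uint8_t)(((a | (a << 8)) * (f | (f << 8))) >> 24);  // ~f*a/255
  }
  static uint8_t AttenuateNEON(uint32_t f, uint32_t a) {
    return (uint8_t)((f * a + 128) >> 8);                       // ~f*a/256
  }
  // For f == a == 255 the first form yields 255, the second 254; the
  // per-platform #if keeps the C reference bit-exact with each assembly.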
// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
@@ -3175,12 +3578,73 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
}
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_SSSE3
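All four helpers above share one pattern: expand up to MAXTWIDTH packed-RGB pixels into an aligned ARGB scratch row with an existing SSSE3 kernel, then run an existing ARGB-to-YJ kernel on it, avoiding dedicated RGB24/RAW kernels. A generalized sketch of the loop (the wrapper and typedefs are hypothetical):

  typedef void (*ToARGBRowFn)(const uint8_t*, uint8_t*, int);
  typedef void (*ToYJRowFn)(const uint8_t*, uint8_t*, int);

  static void PackedToYJViaARGB(const uint8_t* src, uint8_t* dst_yj,
                                int width, int src_bpp,
                                ToARGBRowFn to_argb, ToYJRowFn to_yj) {
    SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);  // ARGB scratch row
    while (width > 0) {
      int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
      to_argb(src, row, twidth);   // e.g. RAWToARGBRow_SSSE3
      to_yj(row, dst_yj, twidth);  // e.g. ARGBToYJRow_AVX2
      src += twidth * src_bpp;     // 3 bytes per pixel for RGB24/RAW
      dst_yj += twidth;
      width -= twidth;
    }
  }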
+
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
-#if defined(__clang__)
-#pragma clang loop vectorize_width(4)
-#endif
for (i = 0; i < width; ++i) {
float v = *src++;
fsum += v * v;
@@ -3231,6 +3695,154 @@ void GaussCol_C(const uint16_t* src0,
}
}
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+ }
+}
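GaussCol_F32_C leaves its column sums unnormalized (the [1 4 6 4 1] kernel weighs 16), and GaussRow_F32_C folds the full 2D normalization into its 1/256 = 1/(16*16) scale. A quick DC check (editorial sketch):

  // A constant input survives the separable pass unchanged.
  static void CheckGaussDC(void) {
    float r[5] = {2.f, 2.f, 2.f, 2.f, 2.f};
    float col, out;
    GaussCol_F32_C(r, r, r, r, r, &col, 1);  // col == 2 * 16 == 32
    float tmp[5] = {col, col, col, col, col};
    GaussRow_F32_C(tmp, &out, 1);            // out == 32 * 16 / 256 == 2
  }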
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_yuv24[0] = src_vu[0]; // V
+ dst_yuv24[1] = src_vu[1]; // U
+ dst_yuv24[2] = src_y[0]; // Y0
+ dst_yuv24[3] = src_vu[0]; // V
+ dst_yuv24[4] = src_vu[1]; // U
+ dst_yuv24[5] = src_y[1]; // Y1
+ src_y += 2;
+ src_vu += 2;
+ dst_yuv24 += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ dst_yuv24[0] = src_vu[0]; // V
+ dst_yuv24[1] = src_vu[1]; // U
+ dst_yuv24[2] = src_y[0]; // Y0
+ }
+}
+
+// Filter 2 rows of AYUV UV's (444) into UV (420).
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ // Output a row of UV values, filtering 2x2 rows of AYUV.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 5] + 2) >>
+ 2;
+ dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 4] + 2) >>
+ 2;
+ src_ayuv += 8;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 0] + 2) >>
+ 2;
+ dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 1] + 2) >>
+ 2;
+ }
+}
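The "+ 2 >> 2" above is a rounded average of the four chroma samples in each 2x2 block; the odd-width tail duplicates the final column so the last output still averages two distinct rows. An equivalent scalar helper (hypothetical):

  // Round-to-nearest average of a 2x2 neighborhood: without the +2,
  // (10 + 11 + 11 + 11) >> 2 truncates to 10 instead of rounding to 11.
  static inline uint8_t Avg2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
    return (uint8_t)((a + b + c + d + 2) >> 2);
  }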
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ // Output a row of VU values, filtering 2x2 rows of AYUV.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 4] + 2) >>
+ 2;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 5] + 2) >>
+ 2;
+ src_ayuv += 8;
+ dst_vu += 2;
+ }
+ if (width & 1) {
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 0] + 2) >>
+ 2;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 1] + 2) >>
+ 2;
+ }
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ // Output a row of Y values.
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = src_ayuv[2]; // v,u,y,a
+ src_ayuv += 4;
+ }
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t u = src_uv[0];
+ uint8_t v = src_uv[1];
+ dst_vu[0] = v;
+ dst_vu[1] = u;
+ src_uv += 2;
+ dst_vu += 2;
+ }
+}
+
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >>
+ 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >>
+ 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+ dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc
index 8d3cb81cec2..a107c30e769 100644
--- a/chromium/third_party/libyuv/source/row_gcc.cc
+++ b/chromium/third_party/libyuv/source/row_gcc.cc
@@ -22,12 +22,15 @@ extern "C" {
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
-static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+ 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
// JPeg full range.
-static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+ 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+ 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@@ -45,8 +48,8 @@ static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0};
// Constants for BGRA
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+ 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
0, -38, -74, 112, 0, -38, -74, 112};
@@ -55,8 +58,8 @@ static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+ 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
-38, -74, 112, 0, -38, -74, 112, 0};
@@ -65,8 +68,8 @@ static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA.
-static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+ 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
0, 112, -74, -38, 0, 112, -74, -38};
@@ -74,17 +77,15 @@ static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
0, -18, -94, 112, 0, -18, -94, 112};
-static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
-
-// 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+ 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
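A worked reading of the new Y constants above (editorial, with example values): the coefficients double from 7-bit precision (13/65/33, shift 7) to 8-bit (25/129/66, shift 8), pixels get 128 subtracted via kSub128 so pmaddubsw can pair unsigned coefficients with signed bytes, and kAddY16 becomes the word bias 0x7e80, which folds the -128 terms, the +16 offset, and the +0.5 rounding into one add:

  // y = (25*(B-128) + 129*(G-128) + 66*(R-128) + 0x7e80) >> 8
  //   = (25*B + 129*G + 66*R + (32384 - 220*128)) >> 8
  //   = (25*B + 129*G + 66*R + 4224) >> 8       // 4224 == 16*256 + 128
  // black (0,0,0)       -> 4224 >> 8           == 16
  // white (255,255,255) -> (56100 + 4224) >> 8 == 235  (BT.601 studio range)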
#ifdef HAS_RGB24TOARGBROW_SSSE3
@@ -97,6 +98,10 @@ static const uvec8 kShuffleMaskRGB24ToARGB = {
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+// Shuffle table for converting RAW to RGBA.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
+ 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
@@ -154,24 +159,24 @@ static const lvec8 kShuffleNV21 = {
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -185,35 +190,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -223,35 +228,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -259,29 +264,68 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
+// Same code as RAWToARGBRow_SSSE3 but with a different shuffler and A in the low bits.
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGBA) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
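A scalar model of RAWToRGBARow_SSSE3 (editorial sketch; in libyuv's naming, RGBA stores alpha in the lowest byte and RAW is R,G,B in memory):

  static void RAWToRGBARow_Model(const uint8_t* src_raw, uint8_t* dst_rgba,
                                 int width) {
    int x;
    for (x = 0; x < width; ++x) {
      dst_rgba[0] = 255;         // A, forced opaque (the psrld $0x18 mask)
      dst_rgba[1] = src_raw[2];  // B
      dst_rgba[2] = src_raw[1];  // G
      dst_rgba[3] = src_raw[0];  // R
      src_raw += 3;
      dst_rgba += 4;
    }
  }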
+
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
uint8_t* dst_rgb24,
int width) {
asm volatile(
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x4(%0),%%xmm1 \n"
- "movdqu 0x8(%0),%%xmm2 \n"
- "lea 0x18(%0),%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -293,44 +337,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -341,47 +385,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -392,34 +436,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,0x00(%1,%0,2) \n"
- "movdqu %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -430,35 +474,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -469,35 +513,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -512,37 +556,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -571,26 +615,26 @@ static const ulvec8 kPermARGBToRGB24_2 = {
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vmovdqa %3,%%ymm5 \n"
- "vmovdqa %4,%%ymm6 \n"
- "vmovdqa %5,%%ymm7 \n"
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
- "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
- "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -606,37 +650,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -650,34 +694,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -690,40 +734,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
const uint32_t dither4,
int width) {
asm volatile(
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -739,35 +783,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -780,38 +824,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -821,26 +865,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -884,31 +928,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028;
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -923,31 +967,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -964,25 +1008,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1001,25 +1045,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1034,82 +1078,130 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif
+// clang-format off
+
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// The round parameter names the register holding the value to add before the shift.
+#define RGBTOY(round) \
+ "1: \n" \
+ "movdqu (%0),%%xmm0 \n" \
+ "movdqu 0x10(%0),%%xmm1 \n" \
+ "movdqu 0x20(%0),%%xmm2 \n" \
+ "movdqu 0x30(%0),%%xmm3 \n" \
+ "psubb %%xmm5,%%xmm0 \n" \
+ "psubb %%xmm5,%%xmm1 \n" \
+ "psubb %%xmm5,%%xmm2 \n" \
+ "psubb %%xmm5,%%xmm3 \n" \
+ "movdqu %%xmm4,%%xmm6 \n" \
+ "pmaddubsw %%xmm0,%%xmm6 \n" \
+ "movdqu %%xmm4,%%xmm0 \n" \
+ "pmaddubsw %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm4,%%xmm1 \n" \
+ "pmaddubsw %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm4,%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "lea 0x40(%0),%0 \n" \
+ "phaddw %%xmm0,%%xmm6 \n" \
+ "phaddw %%xmm2,%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
+ "psrlw $0x8,%%xmm6 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm6 \n" \
+ "movdqu %%xmm6,(%1) \n" \
+ "lea 0x10(%1),%1 \n" \
+ "sub $0x10,%2 \n" \
+ "jg 1b \n"
+
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
+ "vzeroupper \n"
+
+// clang-format on
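The RGBTOY/RGBTOY_AVX2 macros factor out the per-function loops that the hunks below delete, parameterized on the register holding the rounding bias: xmm7/ymm7 carries kAddY16 (0x7e80) for limited-range Y, while the YJ paths reuse kSub128 (0x8080) as their bias. A C model of the math (editorial sketch, not patch text):

  #include <stdint.h>

  // coeff_* are the unsigned byte weights; bias is the 16-bit round word.
  static uint8_t RGBToYModel(uint8_t b, uint8_t g, uint8_t r, int coeff_b,
                             int coeff_g, int coeff_r, int bias) {
    int y = coeff_b * (b - 128) + coeff_g * (g - 128) + coeff_r * (r - 128);
    return (uint8_t)((y + bias) >> 8);
  }
  // RGBToYModel(b, g, r, 25, 129, 66, 0x7e80) -> 16..235 (Y,  BT.601)
  // RGBToYModel(b, g, r, 29, 150, 77, 0x8080) -> 0..255  (YJ, full range)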
+
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBTOYJROW_SSSE3
+#ifdef HAS_RGBATOYJROW_SSSE3
+// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16.
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_SSSE3
+
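A minimal sketch of how a caller might use the new row kernel. The 640 width and the RGBAToYJRow_C fallback name are assumptions; the SSSE3 loop above consumes 16 pixels per iteration, and libyuv normally routes leftover widths through its *_Any wrappers.

#include <stdint.h>
#include "libyuv/row.h"

/* Sketch: convert one 640-pixel row of RGBA to full-range (JPEG) luma. */
void ConvertRowExample(const uint8_t* src_rgba, uint8_t* dst_y) {
#if defined(HAS_RGBATOYJROW_SSSE3)
  RGBAToYJRow_SSSE3(src_rgba, dst_y, 640); /* 640 is a multiple of 16 */
#else
  RGBAToYJRow_C(src_rgba, dst_y, 640);     /* assumed portable fallback */
#endif
}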
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd mask to undo the lane interleave caused by vphaddw + vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
@@ -1119,83 +1211,84 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
- "m"(kAddY16), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_AVX2
+#ifdef HAS_ABGRTOYROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n"
+
+ LABELALIGN RGBTOY_AVX2(ymm7)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYROW_AVX2
+
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
- "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64), // %4
+ "m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
+#ifdef HAS_RGBATOYJROW_AVX2
+// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+
+      LABELALIGN RGBTOY_AVX2(ymm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_AVX2
+
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
@@ -1203,52 +1296,52 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1275,44 +1368,44 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1328,6 +1421,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVROW_AVX2
+#ifdef HAS_ABGRTOUVROW_AVX2
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kABGRToV), // %6
+ "m"(kABGRToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVROW_AVX2
+
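For orientation, a scalar sketch of one U output from these subsampling UV kernels. The function name and block layout are illustrative; libyuv "ARGB" is B,G,R,A in memory (for ABGR swap r and b), and the coefficient tables live elsewhere in this file.

#include <stdint.h>

/* Sketch: one U byte from a 2x2 pixel block, mirroring pavgb + pmaddubsw +
 * psraw + packsswb + paddb kAddUV128. p10/p11 come from the next row via
 * src_stride. */
static inline uint8_t BlockToU(const uint8_t* p00, const uint8_t* p01,
                               const uint8_t* p10, const uint8_t* p11,
                               const int8_t* to_u /* e.g. kARGBToU */) {
  /* pavgb averages with rounding, (a + b + 1) >> 1: first vertically
   * (row 0 vs row 1), then horizontally (left vs right pixel). */
  int b = (((p00[0] + p10[0] + 1) >> 1) + ((p01[0] + p11[0] + 1) >> 1) + 1) >> 1;
  int g = (((p00[1] + p10[1] + 1) >> 1) + ((p01[1] + p11[1] + 1) >> 1) + 1) >> 1;
  int r = (((p00[2] + p10[2] + 1) >> 1) + ((p01[2] + p11[2] + 1) >> 1) + 1) >> 1;
  int u = (to_u[0] * b + to_u[1] * g + to_u[2] * r) >> 8; /* psraw $0x8 */
  return (uint8_t)(u + 128); /* packsswb, then paddb kAddUV128 */
}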
#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
@@ -1338,52 +1494,52 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUVJ128), // %5
+ "m"(kSub128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
@@ -1399,53 +1555,53 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1453,7 +1609,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6
- "m"(kAddUVJ128) // %7
+ "m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_SSSE3
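Why kSub128 can replace the old kAddUVJ128 in the UVJ paths: read as 16-bit lanes, the {128, 128, ...} byte pattern is 0x8080, whose low byte supplies the 0.5 ulp for round-to-nearest at the >>8 and whose high bit flips the word's sign so the signed-saturating pack lands on the 128-biased chroma byte. A worked scalar model (name is illustrative):

#include <stdint.h>

/* Sketch of "paddw kSub128 ; psraw $0x8 ; packsswb" on one chroma word.
 * For t = the pmaddubsw dot product, the stored byte reinterpreted as
 * unsigned equals ((t + 128) >> 8) + 128. */
static inline uint8_t ChromaWordToByte(int16_t t) {
  int16_t u = (int16_t)(t + 0x8080); /* paddw: wraps mod 2^16 */
  int8_t s = (int8_t)(u >> 8);       /* psraw $0x8 ; packsswb */
  return (uint8_t)s;                 /* e.g. t = 0 -> 128, t = 256 -> 129 */
}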
@@ -1464,47 +1620,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1518,36 +1674,19 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@@ -1556,52 +1695,52 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1615,70 +1754,36 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
@@ -1687,52 +1792,52 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1750,52 +1855,52 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -2012,16 +2117,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2041,27 +2146,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- "sub %[u_buf],%[v_buf] \n"
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0,(%[dst_rgb24]) \n"
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2087,16 +2192,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2116,21 +2221,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2151,16 +2256,16 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2181,21 +2286,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2218,15 +2323,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422
YUVTORGB(yuvconstants)
STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2253,15 +2358,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2281,15 +2386,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2309,15 +2414,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2337,15 +2442,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2366,16 +2471,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STORERGBA
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2590,17 +2695,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2624,18 +2729,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2659,23 +2764,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2699,18 +2804,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2734,23 +2839,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2776,16 +2881,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2815,11 +2920,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
@@ -2859,16 +2964,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2892,16 +2997,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2925,16 +3030,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2958,16 +3063,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2982,17 +3087,15 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
#endif // HAS_UYVYTOARGBROW_AVX2
#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
- asm volatile(
- "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
+      "movdqa      224(%3),%%xmm3                \n"  // ygb = -1160 = 32 - 1.164 * 16 * 64
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
@@ -3001,8 +3104,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
@@ -3018,28 +3121,26 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"sub $0x8,%2 \n"
"jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_SSE2
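The rewritten I400 path now pulls yg/ygb from the shared YuvConstants block (the 192/224 offsets above). A scalar model of the per-byte math, under the assumption that yg = 18997 and ygb = -1160 as the comments state:

#include <stdint.h>

/* Sketch of the new Y expansion: y' = clamp(1.164 * (y - 16)). */
static inline uint8_t ExpandY(uint8_t y, uint16_t yg, int16_t ygb) {
  uint16_t y16 = (uint16_t)(y * 0x0101);  /* punpcklbw: y duplicated */
  int32_t v = ((uint32_t)y16 * yg) >> 16; /* pmulhuw: unsigned high mul */
  v = v + ygb;                            /* paddsw: bias plus rounding */
  v >>= 6;                                /* psraw $0x6 */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packuswb */
}
/* ExpandY(16, 18997, -1160) == 0 and ExpandY(235, 18997, -1160) == 255. */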
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
- "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "vmovd %%eax,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
- "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
- "vmovd %%eax,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
+ "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
+      "vmovdqa     224(%3),%%ymm3                \n"  // ygb = -1160 = 32 - 1.164 * 16 * 64
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
@@ -3049,8 +3150,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
@@ -3060,15 +3161,15 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
+ "lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_AVX2
@@ -3081,16 +3182,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %3,%%xmm5 \n"
+ "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu -0x10(%0,%2,1),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3108,13 +3209,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3125,37 +3226,136 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
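What the new MirrorUVRow kernels compute, as a scalar sketch (the _C_sketch name is illustrative, not part of the patch): pair order is reversed while each pair keeps its internal U-then-V byte order, which is exactly what kShuffleMirrorUV encodes.

#include <stdint.h>

/* Scalar equivalent of MirrorUVRow; width counts UV pairs. */
void MirrorUVRow_C_sketch(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_uv[2 * (width - 1 - i) + 0]; /* U stays first */
    dst_uv[2 * i + 1] = src_uv[2 * (width - 1 - i) + 1]; /* V stays second */
  }
}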
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %4,%%xmm1 \n"
- "lea -0x10(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
: "memory", "cc", "xmm0", "xmm1");
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle the first 5 pixels to the last 5, mirrored. First byte is zero.
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+ 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
+
+// Shuffle the last 5 pixels to the first 5, mirrored. Last byte is zero.
+static const uvec8 kShuffleMirrorRGB1 = {
+ 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Mirror 16 pixels (48 bytes) per loop, shuffling 5 pixels (15 bytes) at a time.
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ src_rgb24 += width * 3 - 48;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5
+ "movdqu %%xmm2,2(%1) \n" // next 5
+ "movlpd %%xmm3,0(%1) \n" // first 1
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorRGB0), // %3
+ "m"(kShuffleMirrorRGB1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_RGB24MIRRORROW_SSSE3
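A scalar sketch of what RGB24MirrorRow computes (the _C_sketch name is illustrative): pixel order is reversed while the 3 bytes inside each pixel stay in place, which is why the SSSE3 version needs the two 15-byte shuffle tables above rather than a plain byte reverse.

#include <stdint.h>

/* Scalar equivalent of RGB24MirrorRow; width counts pixels. */
void RGB24MirrorRow_C_sketch(const uint8_t* src_rgb24, uint8_t* dst_rgb24,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* s = src_rgb24 + 3 * (width - 1 - i);
    dst_rgb24[3 * i + 0] = s[0];
    dst_rgb24[3 * i + 1] = s[1];
    dst_rgb24[3 * i + 2] = s[2];
  }
}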
#ifdef HAS_ARGBMIRRORROW_SSE2
@@ -3163,17 +3363,17 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "lea -0x10(%0,%2,4),%0 \n"
+ "lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3189,15 +3389,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "vmovdqu %3,%%ymm5 \n"
+ "vmovdqu %3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3213,28 +3413,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -3251,28 +3451,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -3289,22 +3489,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -3322,21 +3522,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3359,30 +3559,30 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
int width) {
// clang-format off
asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
-
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1,1),%%ymm1 \n"
+ "add $0x20,%0 \n"
+
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "add $0x40,%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3405,24 +3605,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3443,23 +3643,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "add $0x20,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3475,25 +3675,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3514,25 +3714,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "add $0x10,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3548,26 +3748,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "add $0x40,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3619,41 +3819,41 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "lea 0x30(%0),%0 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -3714,42 +3914,42 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,16(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,32(%3) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x30(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -3771,35 +3971,35 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
LABELALIGN
"2: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 2b \n"
-
- LABELALIGN "9: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3814,14 +4014,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3836,7 +4036,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep movsb \n"
+ "rep movsb \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
@@ -3849,29 +4049,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3884,21 +4084,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "vmovdqu 0x20(%0),%%ymm2 \n"
- "lea 0x40(%0),%0 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3917,17 +4117,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0), %%xmm0 \n"
- "movdqu 0x10(%0), %%xmm1 \n"
- "lea 0x20(%0), %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1), %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+rm"(width) // %2
@@ -3945,28 +4145,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile(
- "vmovdqa %3,%%ymm4 \n"
+ "vmovdqa %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0), %%ymm0 \n"
- "vmovdqu 0x20(%0), %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x40(%0), %%ymm2 \n"
- "vmovdqu 0x60(%0), %%ymm3 \n"
- "lea 0x80(%0), %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@@ -3981,31 +4181,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm2 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4018,23 +4218,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vpmovzxbd (%0),%%ymm1 \n"
- "vpmovzxbd 0x8(%0),%%ymm2 \n"
- "lea 0x10(%0),%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4050,7 +4250,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4061,7 +4261,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosb \n"
+ "rep stosb \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v8) // %2
@@ -4072,7 +4272,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4083,21 +4283,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4111,32 +4311,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4150,28 +4350,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4185,16 +4385,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4208,32 +4408,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4247,28 +4447,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4281,22 +4481,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -4311,32 +4511,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4351,30 +4551,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4389,17 +4589,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -4413,32 +4613,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4453,30 +4653,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4498,71 +4698,71 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
// 1 pixel loop.
"91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4586,36 +4786,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(alpha), // %2
@@ -4638,43 +4838,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
"vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
"vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 32 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@@ -4688,7 +4888,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
+// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
@@ -4698,35 +4898,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -4747,29 +4947,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4789,32 +4989,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movzb 0x03(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x07(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
@@ -4834,52 +5034,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
int width) {
uintptr_t alpha;
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
"vbroadcastf128 %5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
// replace VPGATHER
- "movzb 0x03(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x07(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "movzb 0x13(%0),%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x17(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x1b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x1f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
"vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
"vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
"vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// end of VPGATHER
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4896,44 +5096,48 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBGRAYROW_SSSE3
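This hunk is one of the few non-whitespace changes in the section: ARGBGrayRow_SSSE3 now subtracts 128 from the pixels (psubb with kSub128) before pmaddubsw. pmaddubsw multiplies an unsigned operand by a signed one; biasing the pixels into signed range makes them the signed operand, freeing the coefficient vector to use the full unsigned 0..255 range, and the later paddw of kSub128 (0x8080 per word, i.e. 32768 + 128) cancels the -128 * 256 bias while adding 0.5 of rounding ahead of the psrlw of 8. Scalar sketch of the resulting luma, assuming full-range coefficients that sum to 256 (the exact values live in kARGBToYJ):

#include <stdint.h>

// 29/150/77 are assumed BT.601 full-range luma weights (sum = 256).
static inline uint8_t GrayY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}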
@@ -4954,50 +5158,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "m"(kARGBToSepiaB), // %2
@@ -5015,54 +5219,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm6,0x10(%1) \n"
- "lea 0x20(%0),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5080,40 +5284,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -5131,27 +5335,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5168,28 +5372,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5207,26 +5411,26 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu (%1),%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5238,7 +5442,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
- );
+ );
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
@@ -5252,15 +5456,15 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5280,14 +5484,14 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpaddusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5308,15 +5512,15 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5336,14 +5540,14 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpsubusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5365,40 +5569,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobelx,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "movq 0x02(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x00(%0,%2,1),%%xmm2 \n"
- "movq 0x02(%0,%2,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -5419,39 +5623,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobely,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x01(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x02(%0,%1,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -5472,37 +5676,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "movdqu %%xmm3,0x20(%2) \n"
- "movdqu %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5519,21 +5723,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_y,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -5554,36 +5758,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6,(%2) \n"
- "movdqu %%xmm4,0x10(%2) \n"
- "movdqu %%xmm7,0x20(%2) \n"
- "movdqu %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5602,67 +5806,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width) {
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu 0x10(%2),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu 0x20(%2),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu 0x30(%2),%%xmm5 \n"
- "lea 0x40(%2),%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "movdqu %%xmm4,0x20(%1) \n"
- "movdqu %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop.
LABELALIGN
"10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "lea 0x10(%2),%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(row), // %0
@@ -5682,119 +5886,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
uint8_t* dst,
int count) {
asm volatile(
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
-
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
// 4 pixel small loop.
LABELALIGN
"4: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "movdqu (%0),%%xmm0 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
@@ -5817,70 +6021,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile(
- "movq (%3),%%xmm2 \n"
- "movq 0x08(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x04(%2),%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
"+r"(src_argb_stride_temp), // %1
@@ -5902,68 +6106,68 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -5983,61 +6187,61 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
"vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "rep movsb \n"
- "jmp 999f \n"
+ "rep movsb \n"
+ "jmp 999f \n"
"99: \n"
"vzeroupper \n"
@@ -6059,20 +6263,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6093,16 +6297,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -6120,24 +6324,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "add $0x10,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6156,24 +6360,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "add $0x10,%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6192,27 +6396,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6231,27 +6435,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
- "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6269,47 +6473,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
int width) {
asm volatile(
- "pxor %%xmm3,%%xmm3 \n"
+ "pxor %%xmm3,%%xmm3 \n"
// 2 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps 0x10(%3),%%xmm0 \n"
- "mulps 0x10(%3),%%xmm4 \n"
- "addps (%3),%%xmm0 \n"
- "addps (%3),%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps 0x20(%3),%%xmm2 \n"
- "mulps 0x20(%3),%%xmm6 \n"
- "mulps 0x30(%3),%%xmm1 \n"
- "mulps 0x30(%3),%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6405,27 +6609,27 @@ void HalfFloatRow_AVX2(const uint16_t* src,
int width) {
scale *= kScaleBias;
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -6446,8 +6650,8 @@ void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
@@ -6481,7 +6685,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
@@ -6515,21 +6719,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "movzb -0x1(%0),%1 \n"
- "movzb 0x03(%3,%1,4),%1 \n"
- "mov %b1,-0x1(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6548,18 +6752,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6578,86 +6782,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
uintptr_t pixel_temp;
uintptr_t table_temp;
asm volatile(
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%2),%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb (%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,(%3) \n"
- "movzb 0x1(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x1(%3) \n"
- "movzb 0x2(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x2(%3) \n"
- "movzb 0x3(%2),%0 \n"
- "mov %b0,0x3(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x4(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x4(%3) \n"
- "movzb 0x5(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x5(%3) \n"
- "movzb 0x6(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x6(%3) \n"
- "movzb 0x7(%2),%0 \n"
- "mov %b0,0x7(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x8(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x8(%3) \n"
- "movzb 0x9(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x9(%3) \n"
- "movzb 0xa(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xa(%3) \n"
- "movzb 0xb(%2),%0 \n"
- "mov %b0,0xb(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
-
- "movzb 0xc(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xc(%3) \n"
- "movzb 0xd(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xd(%3) \n"
- "movzb 0xe(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xe(%3) \n"
- "movzb 0xf(%2),%0 \n"
- "mov %b0,0xf(%3) \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
: "=&d"(pixel_temp), // %0
"=&a"(table_temp), // %1
"+r"(src_argb), // %2
@@ -6669,6 +6873,300 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+
+// begin NV21ToYUV24Row_C avx2 constants
+static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
+
+static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
+
+static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
+
+static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
+ 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
+
+static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
+ 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
+
+static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
+ 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
+
+static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
+ 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
+
+static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
+ 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
+
+static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
+ 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
+
+// NV21ToYUV24Row_AVX2
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ uint8_t* src_y_ptr;
+ uint64_t src_offset = 0;
+ uint64_t width64;
+
+ width64 = width;
+ src_y_ptr = (uint8_t*)src_y;
+
+ asm volatile(
+ "vmovdqu %5, %%ymm0 \n" // init blend value
+ "vmovdqu %6, %%ymm1 \n" // init blend value
+ "vmovdqu %7, %%ymm2 \n" // init blend value
+ // "sub $0x20, %3 \n" //sub 32 from
+ // width for final loop
+
+ LABELALIGN
+ "1: \n" // label 1
+ "vmovdqu (%0,%4), %%ymm3 \n" // src_y
+ "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
+ "vmovdqu (%1), %%ymm5 \n" // src_uv
+ "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
+ "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
+ // shuf
+ "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
+ // shuf
+ "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
+ "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
+ // shuf
+ "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
+ "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
+ "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
+ "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
+ "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
+ "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
+ "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
+ "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
+ "add $0x20, %4 \n" // add to src buffer
+ // ptr
+ "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
+ "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
+ "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
+ "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
+ "add $0x60,%2 \n" // add to dst buffer
+ // ptr
+ // "cmp %3, %4 \n" //(width64 -
+ // 32 bytes) and src_offset
+ "sub $0x20,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n" // sse-avx2
+ // transistions
+
+ : "+r"(src_y), //%0
+ "+r"(src_vu), //%1
+ "+r"(dst_yuv24), //%2
+ "+r"(width64), //%3
+ "+r"(src_offset) //%4
+ : "m"(kBLEND0), //%5
+ "m"(kBLEND1), //%6
+ "m"(kBLEND2), //%7
+ "m"(kSHUF0), //%8
+ "m"(kSHUF1), //%9
+ "m"(kSHUF2), //%10
+ "m"(kSHUF3), //%11
+ "m"(kSHUF4), //%12
+ "m"(kSHUF5) //%13
+ : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
+ "xmm13", "xmm14", "xmm15");
+}
+#endif // HAS_NV21TOYUV24ROW_AVX2
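
The blend and shuffle tables above expand NV21 (a Y plane plus an interleaved VU plane) into a packed 3-bytes-per-pixel layout, 32 pixels per iteration. A plain-C sketch of the intended output; the byte order within each output triple is an assumption and should be checked against libyuv's NV21ToYUV24Row_C, and width is assumed even, as the AVX2 loop requires:

#include <stdint.h>

void NV21ToYUV24Row_C_sketch(const uint8_t* src_y, const uint8_t* src_vu,
                             uint8_t* dst_yuv24, int width) {
  // Each horizontal pair of pixels shares one V,U pair from the VU plane.
  for (int x = 0; x < width - 1; x += 2) {
    dst_yuv24[0] = src_vu[0];  // V (order assumed)
    dst_yuv24[1] = src_vu[1];  // U
    dst_yuv24[2] = src_y[0];   // Y0
    dst_yuv24[3] = src_vu[0];  // V
    dst_yuv24[4] = src_vu[1];  // U
    dst_yuv24[5] = src_y[1];   // Y1
    src_y += 2;
    src_vu += 2;
    dst_yuv24 += 6;
  }
}
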
+
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "movdqu %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_SSSE3
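
A plain-C equivalent of the byte-pair swap that kShuffleUVToVU encodes above (the name with the _C_sketch suffix is illustrative):

#include <stdint.h>

void SwapUVRow_C_sketch(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t u = src_uv[0];
    uint8_t v = src_uv[1];
    dst_vu[0] = v;
    dst_vu[1] = u;
    src_uv += 2;
    dst_vu += 2;
  }
}
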
+
+#ifdef HAS_SWAPUVROW_AVX2
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_AVX2
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
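+
+// Scalar sketch of the HalfMergeUVRow kernels above: a rounded 2x2 box
+// filter over the planar U and V inputs, interleaved into UV pairs. The
+// psrlw-then-pavgw sequence in the kernels computes the same
+// (a + b + c + d + 2) >> 2. Name and shape here are illustrative only.
+static void HalfMergeUVRow_Sketch(const uint8_t* src_u, int src_stride_u,
+                                  const uint8_t* src_v, int src_stride_v,
+                                  uint8_t* dst_uv, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+                 src_u[src_stride_u + 1] + 2) >> 2;  // U
+    dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+                 src_v[src_stride_v + 1] + 2) >> 2;  // V
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+}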
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+ asm volatile(
+ "pxor %%xmm1,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+ "add 4, %0 \n"
+ "movd %%xmm0, (%1) \n" // store float
+ "add 4, %1 \n"
+ "sub $0x4,%2 \n" // 1 float per loop
+ "jg 1b \n"
+ : "+r"(src_x), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
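+
+// Scalar sketch of ClampFloatToZero_SSE2: per-element max(x, 0.0f), with
+// NaN mapping to 0 as maxss does when a source operand is NaN.
+// Illustrative only.
+static void ClampFloatToZero_Sketch(const float* src_x, float* dst_y, int n) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    dst_y[i] = src_x[i] > 0.0f ? src_x[i] : 0.0f;
+  }
+}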
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/row_mmi.cc b/chromium/third_party/libyuv/source/row_mmi.cc
new file mode 100644
index 00000000000..9a8e2cb2d16
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_mmi.cc
@@ -0,0 +1,7842 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "libyuv/row.h"
+
+#include <string.h> // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// clang-format off
+
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+ const uint64_t mask = 0xff000000ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask] \n\t"
+ "or %[src1], %[src1], %[mask] \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask] \n\t"
+ "or %[src1], %[src1], %[mask] \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ uint64_t src0, src1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0xff000000ULL;
+ const uint64_t mask2 = 0xc6;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask1] \n\t"
+ "punpcklbh %[src0], %[src0], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[mask1] \n\t"
+ "punpcklbh %[src1], %[src1], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask2] \n\t"
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask1] \n\t"
+ "punpcklbh %[src0], %[src0], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[mask1] \n\t"
+ "punpcklbh %[src1], %[src1], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask2] \n\t"
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
+ : "memory");
+}
+
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+ uint64_t mask0 = 0xc6;
+ uint64_t mask1 = 0x6c;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t"
+ "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t"
+ "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t"
+
+ "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
+ "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
+ "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
+ "punpcklbh %[src1], %[src1], %[zero] \n\t"
+ "pextrh %[ftmp2], %[ftmp0], %[three] \n\t"
+ "pextrh %[ftmp3], %[ftmp1], %[one] \n\t"
+ "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "pextrh %[ftmp3], %[ftmp1], %[two] \n\t"
+ "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "pextrh %[ftmp2], %[src1], %[zero] \n\t"
+ "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t"
+ "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "packushb %[src1], %[src1], %[zero] \n\t"
+
+ "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t"
+
+ "daddiu %[src_raw], %[src_raw], 0x0c \n\t"
+ "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
+ [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
+ : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
+ [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
+ [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
+ : "memory");
+}
+
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[5];
+ uint64_t c0 = 0x001f001f001f001f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[c1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
+ : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
+ [four] "f"(0x04)
+ : "memory");
+}
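+
+// Scalar sketch of the 565 expansion above: each channel is widened to
+// 8 bits by replicating its top bits, and alpha is forced to 0xff.
+// Illustrative only; one pixel at a time.
+static void RGB565ToARGBPixel_Sketch(uint16_t rgb565, uint8_t argb[4]) {
+  uint8_t b = rgb565 & 0x1f;
+  uint8_t g = (rgb565 >> 5) & 0x3f;
+  uint8_t r = (uint8_t)(rgb565 >> 11);
+  argb[0] = (uint8_t)((b << 3) | (b >> 2));  // B: 5 -> 8 bits
+  argb[1] = (uint8_t)((g << 2) | (g >> 4));  // G: 6 -> 8 bits
+  argb[2] = (uint8_t)((r << 3) | (r >> 2));  // R: 5 -> 8 bits
+  argb[3] = 0xff;                            // A
+}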
+
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[6];
+ uint64_t c0 = 0x001f001f001f001f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t c2 = 0x0003000300030003;
+ uint64_t c3 = 0x007c007c007c007c;
+ uint64_t c4 = 0x0001000100010001;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psrlh %[a], %[src1], %[seven] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "xor %[a], %[a], %[c1] \n\t"
+ "paddb %[a], %[a], %[c4] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[a] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
+ : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
+ [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
+ : "memory");
+}
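+
+// The alpha handling above relies on a byte-wrap trick: with a1 in {0, 1},
+// (a1 ^ 0xff) + 1 wraps to 0x00 for a1 == 0 and yields 0xff for a1 == 1.
+// Scalar sketch for one ARGB1555 pixel, illustrative only:
+static void ARGB1555ToARGBPixel_Sketch(uint16_t argb1555, uint8_t argb[4]) {
+  uint8_t b = argb1555 & 0x1f;
+  uint8_t g = (argb1555 >> 5) & 0x1f;
+  uint8_t r = (argb1555 >> 10) & 0x1f;
+  argb[0] = (uint8_t)((b << 3) | (b >> 2));     // B: 5 -> 8 bits
+  argb[1] = (uint8_t)((g << 3) | (g >> 2));     // G: 5 -> 8 bits
+  argb[2] = (uint8_t)((r << 3) | (r >> 2));     // R: 5 -> 8 bits
+  argb[3] = (argb1555 & 0x8000) ? 0xff : 0x00;  // A: 1 -> 8 bits
+}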
+
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[6];
+ uint64_t c0 = 0x000f000f000f000f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psrlh %[a], %[src1], %[four] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "psllh %[src0], %[a], %[four] \n\t"
+ "or %[a], %[src0], %[a] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[a] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
+ : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
+ [four] "f"(0x04)
+ : "memory");
+}
+
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+ uint64_t mask0 = 0xc6;
+ uint64_t mask1 = 0x18;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
+ "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
+ "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
+ "punpcklbh %[ftmp2], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+
+ "pextrh %[src0], %[ftmp1], %[two] \n\t"
+ "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t"
+ "pshufh %[ftmp1], %[ftmp1], %[one] \n\t"
+
+ "pextrh %[src0], %[ftmp2], %[two] \n\t"
+ "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t"
+ "pextrh %[src0], %[ftmp2], %[one] \n\t"
+ "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t"
+ "pextrh %[src0], %[ftmp2], %[zero] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "pinsrh_0 %[src1], %[src1], %[src0] \n\t"
+ "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "packushb %[src1], %[src1], %[zero] \n\t"
+
+ "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
+ [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
+ [one] "f"(0x01), [two] "f"(0x02)
+ : "memory");
+}
+
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[two] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[eleven] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
+ [eleven] "f"(0x0b)
+ : "memory");
+}
+
+// dither4 is a row of 4 values from a 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The first byte of the 4 values maps to the upper left of the 4x4 matrix.
+// The 4 values are passed as an int, then referenced as an array, so
+// endianness does not affect the order of the original matrix. But dither4
+// will contain the first pixel in the lower byte for little endian
+// or the upper byte for big endian. A scalar sketch of the dithered
+// conversion follows this function.
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+
+ __asm__ volatile(
+ "punpcklbh %[dither], %[dither], %[zero] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+
+ "paddh %[b], %[b], %[dither] \n\t"
+ "paddh %[g], %[g], %[dither] \n\t"
+ "paddh %[r], %[r], %[dither] \n\t"
+ "pcmpgth %[src0], %[b], %[c0] \n\t"
+ "or %[src0], %[src0], %[b] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "pcmpgth %[src0], %[g], %[c0] \n\t"
+ "or %[src0], %[src0], %[g] \n\t"
+ "and %[g], %[src0], %[c0] \n\t"
+ "pcmpgth %[src0], %[r], %[c0] \n\t"
+ "or %[src0], %[src0], %[r] \n\t"
+ "and %[r], %[src0], %[c0] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[two] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[eleven] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
+ [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
+ : "memory");
+}
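+
+// Scalar sketch of the dithered conversion above: add this pixel's dither
+// byte, clamp to 255 (the kernel does this branch-free with
+// pcmpgth/or/and), then truncate to 5/6/5 bits. Illustrative only.
+static uint16_t ARGBToRGB565DitherPixel_Sketch(const uint8_t argb[4],
+                                               uint8_t dither) {
+  int b = argb[0] + dither;
+  int g = argb[1] + dither;
+  int r = argb[2] + dither;
+  if (b > 255) b = 255;
+  if (g > 255) g = 255;
+  if (r > 255) r = 255;
+  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
+}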
+
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+ "punpckhbh %[a], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[three] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+ "psrlh %[a], %[a], %[seven] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[ten] \n\t"
+ "psllh %[a], %[a], %[fifteen] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+ "or %[b], %[b], %[a] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
+ [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
+ : "memory");
+}
+
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+ "punpckhbh %[a], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[four] \n\t"
+ "psrlh %[g], %[g], %[four] \n\t"
+ "psrlh %[r], %[r], %[four] \n\t"
+ "psrlh %[a], %[a], %[four] \n\t"
+
+ "psllh %[g], %[g], %[four] \n\t"
+ "psllh %[r], %[r], %[eight] \n\t"
+ "psllh %[a], %[a], %[twelve] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+ "or %[b], %[b], %[a] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
+ [twelve] "f"(0x0c)
+ : "memory");
+}
+
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001004200810019;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
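+
+// Scalar sketch of the luma computation above. The multiplier constant
+// 0x0001004200810019 packs the BT.601 coefficients (B=25, G=129, R=66), and
+// the 1 in the top lane applies value = 0x1080, i.e. the +16 luma offset
+// plus 0x80 rounding before the >> 8. Illustrative only.
+static uint8_t ARGBToYPixel_Sketch(const uint8_t argb[4]) {
+  return (uint8_t)((25 * argb[0] + 129 * argb[1] + 66 * argb[2] + 0x1080) >>
+                   8);
+}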
+
+void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
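+
+// Scalar sketch of the chroma computation above, written against the 2x2
+// average of each channel. The kernel carries doubled sums and halved
+// coefficients (mask_u lanes 56/37/19 with value 0x4040), which is
+// arithmetically the same as the usual constants below. Illustrative only.
+static void ARGBToUVPixel_Sketch(int avg_b, int avg_g, int avg_r,
+                                 uint8_t* u, uint8_t* v) {
+  *u = (uint8_t)((112 * avg_b - 74 * avg_g - 38 * avg_r + 0x8080) >> 8);
+  *v = (uint8_t)((112 * avg_r - 94 * avg_g - 18 * avg_b + 0x8080) >> 8);
+}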
+
+void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0019008100420001;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
+void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
+void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001001900810042;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
+void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002F00380002;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
+ "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
+ "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
+ "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
+ "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
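+// RGBAToYRow_MMI converts 8 RGBA pixels (A, B, G, R in memory) to 8 luma
+// bytes per iteration. The halfword coefficients in |mask| are 25, 129 and
+// 66 for B, G and R, and pinsrh_0 swaps the alpha halfword for |value|, so
+// each lane evaluates libyuv's fixed-point BT.601 luma formula:
+//   Y = (66 * R + 129 * G + 25 * B + 0x1080) >> 8
+// where 0x1080 folds the +16 luma offset together with the rounding term.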
+void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0042008100190001;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
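+// RGBAToUVRow_MMI emits one U and one V byte per 2x2 block of RGBA pixels,
+// 8 of each per iteration (16 pixels across two rows). Relative to the
+// variants above, the bias lands in halfword 0 (pinsrh_0, the alpha slot)
+// and mask_u/mask_v carry the halved chroma coefficients in B, G, R order
+// to match the A, B, G, R byte layout.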
+void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
+ "dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
+ "dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
+ "dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
+ "dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
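+// RGB24ToYRow_MMI applies the same BT.601 luma formula as RGBAToYRow_MMI to
+// the 3-bytes-per-pixel RGB24 (B, G, R in memory) layout. Each 8-byte load
+// covers two pixels: the dsll by 8 re-aligns the second pixel so punpckhbh
+// can widen it, and pinsrh_3 overwrites the leftover fourth halfword with
+// |value|; |mask| pairs that slot with coefficient 1 to add the bias.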
+void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001004200810019;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
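+// RGB24ToUVRow_MMI: the same 2x2 box-average-and-multiply structure as
+// RGBAToUVRow_MMI, adapted to 3 bytes per pixel. Source offsets advance by
+// 6 bytes (two pixels) per load and the dsll by 8 re-aligns the second
+// pixel, so one iteration consumes 0x30 bytes (16 pixels) per row and
+// produces 8 U and 8 V bytes.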
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
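+// RAWToYRow_MMI is RGB24ToYRow_MMI with the channel order flipped: RAW
+// stores R, G, B in memory, so |mask| carries the luma coefficients as
+// 66, 129, 25 in halfwords 0..2 instead of 25, 129, 66.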
+void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001001900810042;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
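+// RAWToUVRow_MMI: the R, G, B (RAW) counterpart of RGB24ToUVRow_MMI.
+// mask_u/mask_v and the psubw operand orders are mirrored to account for
+// the swapped positions of the R and B channels.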
+void RAWToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
+ "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
+ "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
+ "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
+ "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
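+// ARGBToYJRow_MMI produces full-range (JPEG) luma: the coefficients in
+// |mask1| are 29, 150 and 77 for B, G and R, and |value| = 0x80 is rounding
+// only, with no +16 offset, so each lane computes
+//   YJ = (77 * R + 150 * G + 29 * B + 0x80) >> 8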
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest0, dest1, dest2, dest3;
+ uint64_t tmp0, tmp1;
+ const uint64_t shift = 0x08;
+ const uint64_t value = 0x80;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x0001004D0096001DULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest2], %[dest2], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest3], %[dest3], %[shift] \n\t"
+
+ "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
+ "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
+ "packushb %[dest], %[tmp0], %[tmp1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
+ [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
+ [tmp1] "=&f"(tmp1)
+ : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
+ [width] "r"(width)
+ : "memory");
+}
+
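+// ARGBToUVJRow_MMI computes full-range (JPEG) chroma. The 2x2 box average
+// uses pavgh for the rounded horizontal fold instead of the explicit
+// add-one-and-shift idiom, and mask_u/mask_v hold the halved JPEG-range
+// coefficients; the geometry is unchanged (16 ARGB pixels in, 8 U and 8 V
+// bytes out per iteration).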
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0015002a003f0002;
+ const uint64_t mask_v = 0x0002003f0035000a;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
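+
+// Editor's note (an annotation, not part of the upstream roll): the loop
+// above shows the unpack/multiply/pack pattern shared by the *ToUVRow_MMI
+// kernels in this file. Each 8-byte group is widened to 16-bit lanes
+// (punpcklbh/punpckhbh against zero), the two source rows are summed and
+// averaged (paddh + pavgh), and the U and V dot products are formed with
+// pmaddhw against mask_u/mask_v. Because pmaddhw can only add lane pairs,
+// the subtractive coefficients are applied afterwards: punpcklwd/punpckhwd
+// separate the positive and negative partial sums, psubw takes their
+// difference, and psraw >> 8 rescales before packsswh/packushb narrow the
+// results back to bytes.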
+
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ uint64_t ftmp[11];
+ const uint64_t value = 0x1080108010801080;
+ const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
+ [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
+ : "memory");
+}
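+
+// Editor's sketch (added annotation, not upstream code): a scalar model of
+// what RGB565ToYRow_MMI computes per pixel. The 25/129/66 weights and the
+// 0x1080 rounding bias are read off the `mask` and `value` constants above;
+// the MMI loop itself handles 8 pixels per iteration, so it assumes width is
+// a multiple of 8.
+static void RGB565ToYRow_ScalarSketch(const uint8_t* src_rgb565,
+                                      uint8_t* dst_y,
+                                      int width) {
+  for (int x = 0; x < width; ++x) {
+    uint16_t px = (uint16_t)(src_rgb565[0] | (src_rgb565[1] << 8));
+    uint16_t b5 = px & 0x1f, g6 = (px >> 5) & 0x3f, r5 = px >> 11;
+    // Replicate the top bits to expand 5/6-bit channels to 8 bits, exactly
+    // as the psllh/psrlh/or triples do above.
+    uint8_t b = (uint8_t)((b5 << 3) | (b5 >> 2));
+    uint8_t g = (uint8_t)((g6 << 2) | (g6 >> 4));
+    uint8_t r = (uint8_t)((r5 << 3) | (r5 >> 2));
+    dst_y[x] = (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
+    src_rgb565 += 2;
+  }
+}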
+
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t ftmp[11];
+ const uint64_t value = 0x1080108010801080;
+ const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0003000300030003;
+  const uint64_t c3 = 0x007c007c007c007c;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
+ [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
+ [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
+        [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02)
+ : "memory");
+}
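+
+// Editor's note: ARGB1555ToYRow_MMI follows the same scheme as the RGB565
+// sketch above. Only the channel extraction differs: all three channels are
+// 5 bits wide (red is isolated with the 0x7c mask, green pieced together
+// from both bytes), so each is expanded with (c << 3) | (c >> 2), and the
+// alpha bit is ignored.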
+
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t ftmp[11];
+  const uint64_t value = 0x1080108010801080;
+  const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x000f000f000f000f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
+ [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
+ [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
+ : "memory");
+}
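+
+// Editor's note: for ARGB4444 each channel is a 4-bit nibble, so the
+// expansion to 8 bits reduces to (c << 4) | c, which is what the three
+// psllh/or pairs above implement; the Y dot product then reuses the same
+// 25/129/66 weights and 0x1080 bias.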
+
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[13];
+  const uint64_t value = 0x2020202020202020;
+  const uint64_t mask_u = 0x0026004a00700002;
+  const uint64_t mask_v = 0x00020070005e0012;
+  const uint64_t mask = 0x93;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest0_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest0_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest1_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest1_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest2_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest2_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest2_v], %[src0], %[c2] \n\t"
+ "psllh %[dest2_v], %[dest2_v], %[three] \n\t"
+ "or %[dest2_v], %[src1], %[dest2_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest2_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest3_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest3_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest3_v], %[src0], %[c2] \n\t"
+ "psllh %[dest3_v], %[dest3_v], %[three] \n\t"
+ "or %[dest3_v], %[src1], %[dest3_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest3_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t"
+ "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
+ [dest3_v] "=&f"(ftmp[12])
+ : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
+ [one] "f"(0x01)
+ : "memory");
+}
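+
+// Editor's note (derived from the mask_u/mask_v/value constants above, not
+// upstream documentation): per 2x2 block of RGB565 pixels, the loop sums the
+// channels over two rows and two columns, rescales the sums to 8-bit
+// averages, and then applies the usual libyuv chroma weights:
+//   U = (112*b - 74*g - 38*r + 0x8080) >> 8
+//   V = (112*r - 94*g - 18*b + 0x8080) >> 8
+// The loop consumes 16 pixels per iteration and reads a second row at
+// src_rgb565 + src_stride_rgb565, so width is assumed to be a multiple of 16.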
+
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[11];
+  const uint64_t value = 0x2020202020202020;
+  const uint64_t mask_u = 0x0026004a00700002;
+  const uint64_t mask_v = 0x00020070005e0012;
+  const uint64_t mask = 0x93;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0003000300030003;
+  const uint64_t c3 = 0x007c007c007c007c;
+ __asm__ volatile(
+ "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest0_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest0_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest1_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest1_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest2_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest2_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest3_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest3_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[dest0_u], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t"
+ "packushb %[dest0_v], %[dest1_u], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t"
+ "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10])
+ : [src_argb1555] "r"(src_argb1555),
+ [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
+ [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
+ [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
+ [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
+ [two] "f"(0x02), [one] "f"(0x01)
+ : "memory");
+}
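+
+// Editor's note: this kernel computes the same U/V dot products as the
+// RGB565 variant above but with only 11 floating-point temporaries, so the
+// results of the first two 8-pixel groups are pre-packed with packsswh
+// halfway through the iteration to free dest0_v/dest1_v for reuse by the
+// second half.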
+
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[13];
+  const uint64_t value = 0x2020202020202020;
+  const uint64_t mask_u = 0x0026004a00700002;
+  const uint64_t mask_v = 0x00020070005e0012;
+  const uint64_t mask = 0x93;
+  const uint64_t c0 = 0x000f000f000f000f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest0_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest0_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest1_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest1_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest2_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest2_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest2_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest3_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest3_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest3_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t"
+ "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
+ [dest3_v] "=&f"(ftmp[12])
+ : [src_argb4444] "r"(src_argb4444),
+ [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
+ [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
+ [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
+ [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
+ [two] "f"(0x02)
+ : "memory");
+}
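+
+// Editor's note: the ARGB4444 chroma path mirrors the RGB565 one; the 4-bit
+// channel sums over each 2x2 block are rescaled to 8 bits with
+// (sum << 2) | (sum >> 4) before the same mask_u/mask_v dot products are
+// applied.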
+
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0026004a00700002;
+ const uint64_t mask_v = 0x00020070005e0012;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
+ [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
+ [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
+ [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
+ [dest3_v] "=&f"(ftmp[11])
+ : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
+ [eight] "f"(0x08)
+ : "memory");
+}
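+
+// Editor's sketch (added annotation, not upstream code): ARGBToUV444 keeps
+// full chroma resolution, so a scalar model is just the per-pixel dot
+// products encoded in mask_u/mask_v, with the 0x4040 lane doubling to the
+// usual 0x8080 bias. The MMI loop handles 8 pixels per iteration, so width
+// is assumed to be a multiple of 8.
+static void ARGBToUV444Row_ScalarSketch(const uint8_t* src_argb,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
+  for (int x = 0; x < width; ++x) {
+    int b = src_argb[0], g = src_argb[1], r = src_argb[2];  // alpha unused
+    dst_u[x] = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
+    dst_v[x] = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
+    src_argb += 4;
+  }
+}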
+
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
+ uint64_t tmp0, tmp1;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x01;
+ const uint64_t mask2 = 0x0080004D0096001DULL;
+ const uint64_t mask3 = 0xFF000000FF000000ULL;
+ const uint64_t mask4 = ~mask3;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "and %[src37], %[src], %[mask3] \n\t"
+
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
+ "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"
+
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
+ "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
+ "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask4] \n\t"
+ "or %[dest], %[dest], %[src37] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
+ [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
+ [src37] "=&f"(src37)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
+ : "memory");
+}
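+
+// Editor's sketch (added annotation, not upstream code): per pixel, the gray
+// kernel evaluates the full-range luma dot product packed into mask2
+// (29/150/77 with +128 rounding), stores it to B, G and R, and carries the
+// original alpha byte through via the mask3/mask4 select. Two pixels are
+// handled per iteration, so width must be even.
+static void ARGBGrayRow_ScalarSketch(const uint8_t* src_argb,
+                                     uint8_t* dst_argb,
+                                     int width) {
+  for (int x = 0; x < width; ++x) {
+    int b = src_argb[0], g = src_argb[1], r = src_argb[2];
+    uint8_t y = (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
+    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;
+    dst_argb[3] = src_argb[3];  // alpha passes through unchanged
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}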
+
+// Convert a row of an image to sepia tone.
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
+ uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
+ uint64_t tmp0, tmp1;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x002300440011ULL;
+ const uint64_t mask2 = 0x002D00580016ULL;
+ const uint64_t mask3 = 0x003200620018ULL;
+ const uint64_t mask4 = 0xFF000000FF000000ULL;
+ const uint64_t shift = 0x07;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "and %[dest37], %[dest], %[mask4] \n\t"
+
+ "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
+ "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
+ "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
+ "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
+ "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
+ "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
+ "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+
+ "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
+ "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
+ "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
+ "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
+ "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
+ "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
+ "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "or %[dest], %[dest], %[dest37] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
+ [dest] "=&f"(dest)
+ : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
+ [mask4] "f"(mask4), [shift] "f"(shift)
+ : "memory");
+}
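+
+// Reference-only scalar sketch (hypothetical helper): each channel becomes a
+// >>7 weighted sum of B/G/R using the weights packed in mask1/mask2/mask3,
+// clamped to 255 by packushb; the alpha byte is preserved via mask4.
+void ARGBSepiaRow_SketchC(uint8_t* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
+    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
+    int sb = (17 * b + 68 * g + 35 * r) >> 7;
+    int sg = (22 * b + 88 * g + 45 * r) >> 7;
+    int sr = (24 * b + 98 * g + 50 * r) >> 7;
+    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
+    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
+    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
+    dst_argb += 4;  // Alpha byte untouched.
+  }
+}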
+
+// Apply color matrix to a row of image. Matrix is signed.
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
+ dest3;
+ uint64_t matrix, matrix_hi, matrix_lo;
+ uint64_t tmp0, tmp1;
+ const uint64_t shift0 = 0x06;
+ const uint64_t shift1 = 0x08;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+
+ "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest0], %[dest0], %[shift0] \n\t"
+
+ "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest1], %[dest1], %[shift0] \n\t"
+
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+
+ "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest2], %[dest2], %[shift0] \n\t"
+
+ "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest3], %[dest3], %[shift0] \n\t"
+
+ "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
+ "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
+ "packushb %[dest], %[tmp0], %[tmp1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
+ [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
+ [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
+ : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
+ [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
+ : "memory");
+}
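+
+// Reference-only scalar sketch (hypothetical helper): every output channel
+// is a signed dot product of the BGRA pixel with one 4-entry row of
+// matrix_argb, shifted right by 6 (shift0) and clamped by packushb.
+void ARGBColorMatrixRow_SketchC(const uint8_t* src_argb, uint8_t* dst_argb,
+                                const int8_t* matrix_argb, int width) {
+  for (int x = 0; x < width; ++x) {
+    for (int j = 0; j < 4; ++j) {
+      const int8_t* m = matrix_argb + j * 4;
+      int v = (src_argb[0] * m[0] + src_argb[1] * m[1] + src_argb[2] * m[2] +
+               src_argb[3] * m[3]) >> 6;
+      dst_argb[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}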
+
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[src] \n\t"
+ "punpckhbh %[src_hi], %[src], %[src] \n\t"
+
+ "punpcklbh %[value], %[value], %[value] \n\t"
+
+ "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
+ [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [value] "f"(value), [shift] "f"(shift)
+ : "memory");
+}
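+
+// Reference-only scalar sketch of the intended per-channel math, as computed
+// on the first pass through the loop above (hypothetical helper): the byte
+// self-interleaves widen s and v to s*0x101 and v*0x101, and pmulhuh plus
+// psrlh keep bits [31:24] of the product, roughly (s * v) >> 8.
+static inline uint8_t ShadeChannel_SketchC(uint8_t s, uint8_t v) {
+  return (uint8_t)(((uint32_t)(s * 0x101) * (uint32_t)(v * 0x101)) >> 24);
+}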
+
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
+ uint64_t dest, dest_lo, dest_hi;
+ const uint64_t mask = 0x0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
+ "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"
+
+ "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
+ "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
+ [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
+ : "memory");
+}
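+
+// Reference-only scalar sketch (hypothetical helper): src0 is widened to
+// s0*0x101 by the self-interleave while src1 stays a plain byte, so pmulhuh
+// returns (s0 * 0x101 * s1) >> 16, approximately (s0 * s1) >> 8.
+static inline uint8_t MultiplyChannel_SketchC(uint8_t s0, uint8_t s1) {
+  return (uint8_t)(((uint32_t)(s0 * 0x101) * s1) >> 16);
+}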
+
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "paddusb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "psubusb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width)
+ : "memory");
+}
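+
+// Both rows above are plain per-byte saturating arithmetic; a reference-only
+// scalar sketch (hypothetical helpers):
+static inline uint8_t AddClamp255_SketchC(uint8_t a, uint8_t b) {
+  int v = a + b;
+  return (uint8_t)(v > 255 ? 255 : v);  // paddusb
+}
+static inline uint8_t SubClamp0_SketchC(uint8_t a, uint8_t b) {
+  int v = a - b;
+  return (uint8_t)(v < 0 ? 0 : v);  // psubusb
+}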
+
+// Sobel functions which mimic SSSE3.
+void SobelXRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ uint64_t y00 = 0, y10 = 0, y20 = 0;
+ uint64_t y02 = 0, y12 = 0, y22 = 0;
+ uint64_t zero = 0x0;
+ uint64_t sobel = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
+ "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2]
+ "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i]
+ "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2]
+ "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
+
+ "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i]
+ "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
+ "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2]
+ "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y20], %[y20], %[zero] \n\t"
+
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+ "punpcklbh %[y22], %[y22], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y10] \n\t" // a+b
+ "paddh %[y20], %[y20], %[y10] \n\t" // c+b
+ "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c
+
+ "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub
+ "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub
+ "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub
+
+ "pmaxsh %[y10], %[y00], %[y02] \n\t"
+ "pminsh %[y20], %[y00], %[y02] \n\t"
+ "psubh %[sobel], %[y10], %[y20] \n\t" // Abs
+
+ "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
+ "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
+ "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
+ "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
+ "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
+
+ "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
+ "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
+ "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
+ "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y20], %[y20], %[zero] \n\t"
+
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+ "punpcklbh %[y22], %[y22], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y10] \n\t"
+ "paddh %[y20], %[y20], %[y10] \n\t"
+ "paddh %[y00], %[y00], %[y20] \n\t"
+
+ "paddh %[y02], %[y02], %[y12] \n\t"
+ "paddh %[y22], %[y22], %[y12] \n\t"
+ "paddh %[y02], %[y02], %[y22] \n\t"
+
+ "pmaxsh %[y10], %[y00], %[y02] \n\t"
+ "pminsh %[y20], %[y00], %[y02] \n\t"
+ "psubh %[y00], %[y10], %[y20] \n\t"
+
+ "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
+ "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
+
+ "daddiu %[src_y0], %[src_y0], 8 \n\t"
+ "daddiu %[src_y1], %[src_y1], 8 \n\t"
+ "daddiu %[src_y2], %[src_y2], 8 \n\t"
+ "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
+ [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
+ : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
+ [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
+ : "memory");
+}
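+
+// Reference-only scalar sketch of the filter above (hypothetical helper): a
+// 1-2-1 vertical smoothing of columns i and i+2, then the
+// pmaxsh/pminsh/psubh absolute difference and the packushb clamp to 255.
+void SobelXRow_SketchC(const uint8_t* src_y0, const uint8_t* src_y1,
+                       const uint8_t* src_y2, uint8_t* dst_sobelx,
+                       int width) {
+  for (int i = 0; i < width; ++i) {
+    int a = src_y0[i] + 2 * src_y1[i] + src_y2[i];
+    int b = src_y0[i + 2] + 2 * src_y1[i + 2] + src_y2[i + 2];
+    int sobel = a > b ? a - b : b - a;
+    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
+  }
+}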
+
+void SobelYRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ uint64_t y00 = 0, y01 = 0, y02 = 0;
+ uint64_t y10 = 0, y11 = 0, y12 = 0;
+ uint64_t zero = 0x0;
+ uint64_t sobel = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
+ "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
+ "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1]
+ "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2]
+ "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i]
+ "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
+ "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1]
+ "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2]
+ "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y01], %[y01], %[zero] \n\t"
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y11], %[y11], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y01] \n\t" // a+b
+ "paddh %[y02], %[y02], %[y01] \n\t" // c+b
+ "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c
+
+ "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub
+ "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub
+ "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub
+
+ "pmaxsh %[y02], %[y00], %[y10] \n\t"
+ "pminsh %[y12], %[y00], %[y10] \n\t"
+ "psubh %[sobel], %[y02], %[y12] \n\t" // Abs
+
+ "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
+ "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
+ "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
+ "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
+ "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
+ "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
+ "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
+ "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
+ "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y01], %[y01], %[zero] \n\t"
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y11], %[y11], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y01] \n\t"
+ "paddh %[y02], %[y02], %[y01] \n\t"
+ "paddh %[y00], %[y00], %[y02] \n\t"
+
+ "paddh %[y10], %[y10], %[y11] \n\t"
+ "paddh %[y12], %[y12], %[y11] \n\t"
+ "paddh %[y10], %[y10], %[y12] \n\t"
+
+ "pmaxsh %[y02], %[y00], %[y10] \n\t"
+ "pminsh %[y12], %[y00], %[y10] \n\t"
+ "psubh %[y00], %[y02], %[y12] \n\t"
+
+ "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
+ "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
+
+ "daddiu %[src_y0], %[src_y0], 8 \n\t"
+ "daddiu %[src_y1], %[src_y1], 8 \n\t"
+ "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
+ [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
+ : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
+ [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
+ : "memory");
+}
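+
+// Reference-only scalar sketch (hypothetical helper): the same 1-2-1
+// smoothing and absolute difference as SobelXRow, but taken along each of
+// two rows instead of down each of two columns.
+void SobelYRow_SketchC(const uint8_t* src_y0, const uint8_t* src_y1,
+                       uint8_t* dst_sobely, int width) {
+  for (int i = 0; i < width; ++i) {
+    int a = src_y0[i] + 2 * src_y0[i + 1] + src_y0[i + 2];
+    int b = src_y1[i] + 2 * src_y1[i + 1] + src_y1[i + 2];
+    int sobel = a > b ? a - b : b - a;
+    dst_sobely[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
+  }
+}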
+
+void SobelRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+  uint64_t temp[3];
+ uint64_t c1 = 0xff000000ff000000;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i]
+ "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
+ // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
+ "paddusb %[t2] , %[t0], %[t1] \n\t"
+
+ // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
+ "punpcklbh %[t0], %[t2], %[t2] \n\t"
+
+      // s1 s1 s0 s0->s1 s1 s1 s1 s0 s0 s0 s0
+ "punpcklbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+      // 255 s1 s1 s1 255 s0 s0 s0
+ "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
+
+ // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
+ "punpckhbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ // 255 s3 s3 s3 255 s2 s2 s2
+ "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
+
+ // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
+ "punpckhbh %[t0], %[t2], %[t2] \n\t"
+
+ // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
+ "punpcklbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
+
+ // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
+ "punpckhbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
+
+ "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
+ : "memory");
+}
+
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t tr = 0;
+ uint64_t tb = 0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
+ "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
+ "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "paddusb %[tr], %[tr], %[tb] \n\t" // g
+ "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
+
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [tr] "=&f"(tr), [tb] "=&f"(tb)
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_y] "r"(dst_y), [width] "r"(width)
+ : "memory");
+}
+
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t temp[3];
+ uint64_t result = 0;
+ uint64_t gb = 0;
+ uint64_t cr = 0;
+ uint64_t c1 = 0xffffffffffffffff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
+ "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t"
+ "paddusb %[tg] , %[tr], %[tb] \n\t" // g
+
+ // g3 b3 g2 b2 g1 b1 g0 b0
+ "punpcklbh %[gb], %[tb], %[tg] \n\t"
+      // c3 r3 c2 r2 c1 r1 c0 r0
+ "punpcklbh %[cr], %[tr], %[c1] \n\t"
+ // c1 r1 g1 b1 c0 r0 g0 b0
+ "punpcklhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
+ // c3 r3 g3 b3 c2 r2 g2 b2
+ "punpckhhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
+
+ // g7 b7 g6 b6 g5 b5 g4 b4
+ "punpckhbh %[gb], %[tb], %[tg] \n\t"
+ // c7 r7 c6 r6 c5 r5 c4 r4
+ "punpckhbh %[cr], %[tr], %[c1] \n\t"
+ // c5 r5 g5 b5 c4 r4 g4 b4
+ "punpcklhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
+ // c7 r7 g7 b7 c6 r6 g6 b6
+ "punpckhhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
+
+ "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
+ [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
+ : "memory");
+}
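+
+// Reference-only scalar sketch of the packing above (hypothetical helper):
+// per pixel the interleaves place sobely in B, the saturated sum in G,
+// sobelx in R, and 255 (from c1) in A.
+void SobelXYRow_SketchC(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                        uint8_t* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    int s = src_sobelx[i] + src_sobely[i];
+    dst_argb[0] = src_sobely[i];                 // B
+    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  // G (paddusb)
+    dst_argb[2] = src_sobelx[i];                 // R
+    dst_argb[3] = 255;                           // A
+    dst_argb += 4;
+  }
+}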
+
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ // Copy a Y to RGB.
+ uint64_t src, dest;
+ const uint64_t mask0 = 0x00ffffff00ffffffULL;
+ const uint64_t mask1 = ~mask0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src], %[src], %[src] \n\t"
+ "punpcklhw %[dest], %[src], %[src] \n\t"
+ "and %[dest], %[dest], %[mask0] \n\t"
+ "or %[dest], %[dest], %[mask1] \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+
+ "punpckhhw %[dest], %[src], %[src] \n\t"
+ "and %[dest], %[dest], %[mask0] \n\t"
+ "or %[dest], %[dest], %[mask1] \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+// TODO: respect YuvConstants.
+void I400ToARGBRow_MMI(const uint8_t* src_y,
+                       uint8_t* rgb_buf,
+                       const struct YuvConstants* /* yuvconstants */,
+                       int width) {
+ uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x55;
+ const uint64_t mask2 = 0xAA;
+ const uint64_t mask3 = 0xFF;
+ const uint64_t mask4 = 0x4A354A354A354A35ULL;
+ const uint64_t mask5 = 0x0488048804880488ULL;
+ const uint64_t shift0 = 0x08;
+ const uint64_t shift1 = 0x06;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+
+ "pshufh %[src], %[src_lo], %[mask0] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_lo], %[mask1] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_lo], %[mask2] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_lo], %[mask3] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_hi], %[mask0] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_hi], %[mask1] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_hi], %[mask2] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_hi], %[mask3] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo)
+ : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
+ [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
+ [shift1] "f"(shift1), [width] "r"(width)
+ : "memory");
+}
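+
+// Worked sketch of the per-byte math above (hypothetical helper): psllh plus
+// paddush widen y to y*0x101; mask4 (0x4A35 = 19013) and mask5 (0x0488 =
+// 1160) then give roughly (y - 16) * 1.164, the BT.601 limited-to-full range
+// expansion, clamped by packushb; pinsrh_3 forces the alpha lane to 255.
+static inline uint8_t I400Expand_SketchC(uint8_t y) {
+  int v = ((((int)y * 0x101) * 19013) >> 16) - 1160;   // pmulhuh, psubh
+  v >>= 6;                                             // psrah shift1
+  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   // packushb
+}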
+
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, src0, src1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x1b;
+
+ src += width - 1;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[source], 0(%[src_ptr]) \n\t"
+ "gsldrc1 %[source], -7(%[src_ptr]) \n\t"
+ "punpcklbh %[src0], %[source], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask1] \n\t"
+ "punpckhbh %[src1], %[source], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "packushb %[dest], %[src1], %[src0] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], -0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
+ [src1] "=&f"(src1)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src0, src1, dest0, dest1;
+ const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
+ const uint64_t mask1 = 0x1b;
+ const uint64_t shift = 0x08;
+
+ src_uv += (width - 1) << 1;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 1(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], -6(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], -7(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], -14(%[src_ptr]) \n\t"
+
+ "and %[dest0], %[src0], %[mask0] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "and %[dest1], %[src1], %[mask0] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t"
+
+ "psrlh %[dest0], %[src0], %[shift] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "psrlh %[dest1], %[src1], %[shift] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], -0x10 \n\t"
+ "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t"
+ "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
+ [src1] "=&f"(src1)
+ : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
+ [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ src += (width - 1) * 4;
+ uint64_t temp = 0x0;
+ uint64_t shuff = 0x4e; // 01 00 11 10
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[temp], 3(%[src]) \n\t"
+ "gsldrc1 %[temp], -4(%[src]) \n\t"
+ "pshufh %[temp], %[temp], %[shuff] \n\t"
+ "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
+ "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
+
+ "daddiu %[src], %[src], -0x08 \n\t"
+ "daddiu %[dst], %[dst], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [temp] "=&f"(temp)
+ : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
+ : "memory");
+}
+
+void SplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
+
+ "and %[t2], %[t0], %[c0] \n\t"
+ "and %[t3], %[t1], %[c0] \n\t"
+ "packushb %[t2], %[t2], %[t3] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
+
+ "psrlh %[t2], %[t0], %[shift] \n\t"
+ "psrlh %[t3], %[t1], %[shift] \n\t"
+ "packushb %[t2], %[t2], %[t3] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
+
+ "daddiu %[src_uv], %[src_uv], 16 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [t3] "=&f"(temp[3])
+ : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+void MergeUVRow_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ uint64_t temp[3];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
+ "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
+ "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
+ "punpcklbh %[t2], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
+ "punpckhbh %[t2], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
+
+ "daddiu %[src_u], %[src_u], 8 \n\t"
+ "daddiu %[src_v], %[src_v], 8 \n\t"
+ "daddiu %[dst_uv], %[dst_uv], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
+ : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [width] "r"(width)
+ : "memory");
+}
+
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ uint64_t src[4];
+ uint64_t dest_hi, dest_lo, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[src1] \n\t"
+ "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src2], %[src3] \n\t"
+
+ "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t"
+ "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t"
+ "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t"
+ "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
+ [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
+ [dstb_ptr] "r"(dst_b), [width] "r"(width)
+ : "memory");
+}
+
+void MergeRGBRow_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t srcr, srcg, srcb, dest;
+ uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
+ const uint64_t temp = 0x0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t"
+ "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t"
+ "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t"
+ "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t"
+ "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t"
+ "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t"
+
+ "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t"
+ "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t"
+ "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t"
+ "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t"
+
+ "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
+ "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t"
+ "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
+ "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
+ "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t"
+
+ "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t"
+ "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t"
+ "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
+ [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
+ [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
+ [srcbz_lo] "=&f"(srcbz_lo)
+ : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
+ [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
+ : "memory");
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0xff00ff00ff00ff00;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t temp[3];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ uint64_t src_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c1] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c1] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
+ [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
+ : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
+ : "memory");
+}
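+
+// Reference-only scalar sketch (hypothetical helper): YUY2 stores Y0 U Y1 V,
+// so U and V sit at bytes 4*i+1 and 4*i+3, and pavgb is the rounding average
+// (a + b + 1) >> 1 of the two rows.
+void YUY2ToUVRow_SketchC(const uint8_t* src_yuy2, int src_stride_yuy2,
+                         uint8_t* dst_u, uint8_t* dst_v, int width) {
+  const uint8_t* next_row = src_yuy2 + src_stride_yuy2;
+  for (int x = 0; x < width; x += 2) {
+    dst_u[0] = (uint8_t)((src_yuy2[1] + next_row[1] + 1) >> 1);
+    dst_v[0] = (uint8_t)((src_yuy2[3] + next_row[3] + 1) >> 1);
+    src_yuy2 += 4;
+    next_row += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}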
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0xff00ff00ff00ff00;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c1] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c1] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
+ [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
+ : "memory");
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
+ : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
+ [c0] "f"(c0)
+ : "memory");
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ // Output a row of UV values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[3];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ uint64_t src_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c0] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c0] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
+ [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
+ : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
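+
+// Reference-only scalar sketch (hypothetical helper): UYVY stores U Y0 V Y1,
+// so U and V sit at bytes 4*i and 4*i+2; otherwise identical to the YUY2
+// version above.
+void UYVYToUVRow_SketchC(const uint8_t* src_uyvy, int src_stride_uyvy,
+                         uint8_t* dst_u, uint8_t* dst_v, int width) {
+  const uint8_t* next_row = src_uyvy + src_stride_uyvy;
+  for (int x = 0; x < width; x += 2) {
+    dst_u[0] = (uint8_t)((src_uyvy[0] + next_row[0] + 1) >> 1);
+    dst_v[0] = (uint8_t)((src_uyvy[2] + next_row[2] + 1) >> 1);
+    src_uyvy += 4;
+    next_row += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}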
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ // Output a row of UV values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c0] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c0] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
+ [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ // Output a row of Y values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t shift = 0x08;
+ uint64_t temp[2];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
+ "dsrl %[t0], %[t0], %[shift] \n\t"
+ "dsrl %[t1], %[t1], %[shift] \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+      "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
+ : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
+ [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
+ dest_lo;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
+ const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
+ const uint64_t mask3 = 0xFF;
+ const uint64_t mask4 = ~mask1;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
+
+ "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
+ "pshufh %[alpha], %[alpha], %[mask3] \n\t"
+ "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"
+
+ "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
+
+ "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
+ "pshufh %[alpha], %[alpha], %[mask3] \n\t"
+ "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[mask4] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
+ [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
+ [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
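+
+// Reference-only scalar sketch (hypothetical helper): pshufh broadcasts
+// (255 - src0 alpha) computed from mask2, src1 is scaled by it, and
+// mask1/mask4 force the result alpha to 255 after the saturating add.
+void ARGBBlendRow_SketchC(const uint8_t* src_argb0, const uint8_t* src_argb1,
+                          uint8_t* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
+    int a = src_argb0[3];
+    for (int c = 0; c < 3; ++c) {
+      int v = src_argb0[c] + ((src_argb1[c] * (255 - a)) >> 8);
+      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
+    }
+    dst_argb[3] = 255;
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}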
+
+void BlendPlaneRow_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ uint64_t source0, source1, dest, alph;
+ uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
+ dest_lo;
+ uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
+ const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
+ "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
+
+ "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
+ "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
+ "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
+ "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
+ "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
+ "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
+ "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"
+
+ "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
+ "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+
+ "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
+ "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
+ [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
+ [alpha_r] "=&f"(alpha_rev)
+ : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
+ [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
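+
+// Reference-only scalar sketch (hypothetical helper): a per-byte linear
+// blend with the +255 rounding term (mask2) folded in before the >>8.
+static inline uint8_t BlendPlanePixel_SketchC(uint8_t s0, uint8_t s1,
+                                              uint8_t a) {
+  return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
+}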
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
+ const uint64_t mask0 = 0xFF;
+ const uint64_t mask1 = 0xFF000000FF000000ULL;
+ const uint64_t mask2 = ~mask1;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[src] \n\t"
+ "punpckhbh %[src_hi], %[src], %[src] \n\t"
+
+ "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
+ "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
+ "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask2] \n\t"
+ "and %[src], %[src], %[mask1] \n\t"
+ "or %[dest], %[dest], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
+ [width] "r"(width)
+ : "memory");
+}
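+
+// Reference-only scalar sketch of one channel (hypothetical helper): the
+// self-interleave and the pshufh alpha broadcast give s*0x101 and a*0x101,
+// and pmulhuh plus psrlh keep bits [31:24], roughly (s * a) >> 8; the alpha
+// byte itself passes through via mask1/mask2.
+static inline uint8_t AttenuateChannel_SketchC(uint8_t s, uint8_t a) {
+  return (uint8_t)(((uint32_t)(s * 0x101) * (uint32_t)(a * 0x101)) >> 24);
+}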
+
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width) {
+ int64_t row_sum[2] = {0, 0};
+ uint64_t src, dest0, dest1, presrc0, presrc1, dest;
+ const uint64_t mask = 0x0;
+
+ __asm__ volatile(
+ "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
+ "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"
+
+ "punpcklbh %[src], %[src], %[mask] \n\t"
+ "punpcklhw %[dest0], %[src], %[mask] \n\t"
+ "punpckhhw %[dest1], %[src], %[mask] \n\t"
+
+ "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
+ "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"
+
+ "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
+ "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
+ "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
+ "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"
+
+ "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
+ "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"
+
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
+ "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x01 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
+ [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
+ [presrc1] "=&f"(presrc1)
+ : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
+ [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
+ : "memory");
+}
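+
+// What the loop above computes, one ARGB pixel per iteration (scalar sketch,
+// illustrative only):
+//   for (c = 0; c < 4; ++c) {
+//     row_sum[c] += row[4 * x + c];
+//     cumsum[4 * x + c] = row_sum[c] + previous_cumsum[4 * x + c];
+//   }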
+
+// Bilinear row blend, 2x2 -> 2x1 (mirrors the C version).
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ if (source_y_fraction == 0) {
+ __asm__ volatile(
+ "1: \n\t"
+ "ld $t0, 0x0(%[src_ptr]) \n\t"
+ "sd $t0, 0x0(%[dst_ptr]) \n\t"
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ :
+ : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
+ : "memory");
+ return;
+ }
+ if (source_y_fraction == 128) {
+ uint64_t uv = 0x0;
+ uint64_t uv_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
+ "daddu $t0, %[src_ptr], %[stride] \n\t"
+ "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
+ "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
+
+ "pavgb %[uv], %[uv], %[uv_stride] \n\t"
+ "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
+ [stride] "r"((int64_t)src_stride)
+ : "memory");
+ return;
+ }
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint64_t temp;
+ uint64_t data[4];
+ uint64_t zero = 0x0;
+ uint64_t c0 = 0x0080008000800080;
+ uint64_t fy0 = 0x0100010001000100;
+ uint64_t shift = 0x8;
+ __asm__ volatile(
+ "pshufh %[fy1], %[fy1], %[zero] \n\t"
+ "psubh %[fy0], %[fy0], %[fy1] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
+ "punpcklbh %[d0], %[t0], %[zero] \n\t"
+ "punpckhbh %[d1], %[t0], %[zero] \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
+ "punpcklbh %[d2], %[t0], %[zero] \n\t"
+ "punpckhbh %[d3], %[t0], %[zero] \n\t"
+
+ "pmullh %[d0], %[d0], %[fy0] \n\t"
+ "pmullh %[d2], %[d2], %[fy1] \n\t"
+ "paddh %[d0], %[d0], %[d2] \n\t"
+ "paddh %[d0], %[d0], %[c0] \n\t"
+ "psrlh %[d0], %[d0], %[shift] \n\t"
+
+ "pmullh %[d1], %[d1], %[fy0] \n\t"
+ "pmullh %[d3], %[d3], %[fy1] \n\t"
+ "paddh %[d1], %[d1], %[d3] \n\t"
+ "paddh %[d1], %[d1], %[c0] \n\t"
+ "psrlh %[d1], %[d1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d1] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
+ [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
+ [dst_ptr] "r"(dst_ptr), [width] "r"(width),
+ [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
+ [shift] "f"(shift), [zero] "f"(zero)
+ : "memory");
+}
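+
+// General-case blend performed above, per byte (sketch derived from the
+// constants c0 = 0x0080... and fy0 = 0x0100...; illustrative only):
+//   f = source_y_fraction;  // 1..255, excluding the 0 and 128 fast paths
+//   dst[i] = (src[i] * (256 - f) + src[i + src_stride] * f + 128) >> 8;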
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ uint64_t source, dest0, dest1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
+ ((shuffler[2] & 0x03) << 4) |
+ ((shuffler[3] & 0x03) << 6);
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest0], %[src], %[mask0] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "punpckhbh %[dest1], %[src], %[mask0] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
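+
+// mask1 packs shuffler[0..3] into a 2-bit-per-lane pshufh selector, so the
+// loop is equivalent to (scalar sketch): for each pixel i and j = 0..3,
+//   dst_argb[4 * i + j] = src_argb[4 * i + shuffler[j]];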
+
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ uint64_t temp[3];
+ uint64_t vu = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
+ "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
+ "punpcklbh %[tu], %[ty], %[vu] \n\t" // g
+ "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
+ "punpckhbh %[tu], %[ty], %[vu] \n\t" // g
+ "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
+ "daddiu %[src_y], %[src_y], 8 \n\t"
+ "daddiu %[src_u], %[src_u], 4 \n\t"
+ "daddiu %[src_v], %[src_v], 4 \n\t"
+ "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
+ [vu] "=&f"(vu)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [dst_frame] "r"(dst_frame), [width] "r"(width)
+ : "memory");
+}
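+
+// Byte layout written above (8 Y + 4 U + 4 V per iteration -> 16 bytes):
+//   dst = { Y0, U0, Y1, V0, Y2, U1, Y3, V1, ... }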
+
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ uint64_t temp[3];
+ uint64_t vu = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
+ "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
+ "punpcklbh %[tu], %[vu], %[ty] \n\t" // g
+ "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
+ "punpckhbh %[tu], %[vu], %[ty] \n\t" // g
+ "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
+ "daddiu %[src_y], %[src_y], 8 \n\t"
+ "daddiu %[src_u], %[src_u], 4 \n\t"
+ "daddiu %[src_v], %[src_v], 4 \n\t"
+ "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
+ [vu] "=&f"(vu)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [dst_frame] "r"(dst_frame), [width] "r"(width)
+ : "memory");
+}
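+
+// Same as I422ToYUY2Row_MMI but with chroma leading each pair:
+//   dst = { U0, Y0, V0, Y1, U1, Y2, V1, Y3, ... }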
+
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, dest;
+ const uint64_t mask0 = 0xff000000ff000000ULL;
+ const uint64_t mask1 = ~mask0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "and %[src], %[src], %[mask0] \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[src], %[dest] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
+ const uint64_t mask = 0xff000000ff000000ULL;
+ const uint64_t shift = 0x18;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "and %[dest0], %[src], %[mask] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "and %[dest1], %[src], %[mask] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
+ "and %[dest0], %[src], %[mask] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
+ "and %[dest1], %[src], %[mask] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
+ [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
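+
+// Scalar equivalent (sketch): dst_a[i] = src_argb[4 * i + 3]; the loop above
+// does eight pixels per iteration via mask, shift and pack.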
+
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, dest0, dest1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x00ffffff00ffffffULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest0], %[mask0], %[src] \n\t"
+ "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "punpckhbh %[dest0], %[mask0], %[src] \n\t"
+ "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
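+
+// Scalar equivalent (sketch): dst[4 * i + 3] = src[i]; an 8-bit plane is
+// written into the alpha channel, eight pixels per iteration.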
+
+void I444ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+ __asm__ volatile (
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ "punpcklbh %[u], %[u], %[zero] \n\t"//u
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[zero] \n\t"//v
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
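+
+// Fixed-point math shared by this family of kernels (a sketch read off the
+// instruction sequence; the YuvConstants offsets used by the loads above are
+// 0x00 ub, 0x20 ug/vg, 0x40 vr, 0x60 bb, 0x80 bg, 0xa0 br, 0xc0 yg):
+//   y1 = ((y * 0x0101) * yg) >> 16;
+//   b = clamp8((y1 + bb - u * ub) >> 6);
+//   g = clamp8((y1 + bg - u * ug - v * vg) >> 6);
+//   r = clamp8((y1 + br - v * vr) >> 6);
+// with saturating 16-bit adds/subtracts and alpha forced to 0xff.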
+
+// Also used for 420
+void I422ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"//v
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
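+
+// I422 differs from I444 only in the chroma fetch: each U/V byte is
+// duplicated (u3|u2|u1|u0 -> u1|u1|u0|u0) so one chroma sample covers two Y
+// samples, which is also why this row function serves I420.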
+
+// 10 bit YUV to ARGB
+void I210ToARGBRow_MMI(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "psllh %[y], %[y], %[six] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklhw %[u], %[u], %[u] \n\t"
+ "psrah %[u], %[u], %[two] \n\t"
+ "punpcklhw %[v], %[v], %[v] \n\t"
+ "psrah %[v], %[v], %[two] \n\t"
+ "pminsh %[u], %[u], %[mask1] \n\t"
+ "pminsh %[v], %[v], %[mask1] \n\t"
+
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask), [two]"f"(0x02),
+ [mask1]"f"(0x00ff00ff00ff00ff)
+ : "memory"
+ );
+}
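+
+// 10-bit handling (sketch): Y is shifted left 6 so the 10-bit sample fills
+// the 16-bit lane before the yg multiply; U/V are shifted right 2 and
+// clamped to 0xff (pminsh with mask1) so the 8-bit math applies unchanged.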
+
+void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v,a;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
+ "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v), [a]"=&f"(a),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [a_ptr]"r"(src_a), [zero]"f"(0x00),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
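+
+// Same as I422ToARGBRow_MMI except the alpha byte is taken from src_a
+// instead of a constant 0xff.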
+
+void I422ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+
+ "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
+ "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
+ "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
+ "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
+ "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
+ "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
+ "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
+ "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
+ "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(mask),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
+}
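+
+// The pextrh/pinsrh tail above repacks four ARGB pixels into 12 bytes of
+// RGB24, dropping each alpha byte:
+//   dst = { B0, G0, R0, B1, G1, R1, B2, G2, R2, B3, G3, R3 }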
+
+void I422ToARGB4444Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "and %[g_vec], %[g_vec], %[mask1] \n\t"
+ "psrlw %[g_vec], %[g_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[g_vec], %[four] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "and %[b_vec], %[b_vec], %[mask1] \n\t"
+ "psrlw %[b_vec], %[b_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[b_vec], %[four] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00),
+ [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
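+
+// Per-pixel packing performed above (sketch, illustrative only):
+//   dst = (b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) | ((a >> 4) << 12);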
+
+void I422ToARGB1555Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlw %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "or %[g_vec], %[g_vec], %[mask3] \n\t"
+
+ "psrlw %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "or %[b_vec], %[b_vec], %[mask3] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [mask3]"f"(0x800000008000),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
+}
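+
+// Per-pixel packing performed above (sketch; mask3 = 0x8000 sets the A bit):
+//   dst = (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) | 0x8000;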
+
+void I422ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
+}
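+
+// Per-pixel packing performed above (sketch):
+//   dst = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);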
+
+void NV12ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
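+
+// ushu (0xA0 -> lanes 0,0,2,2) and vshu (0xf5 -> lanes 1,1,3,3) broadcast
+// the interleaved UV halfwords to u0|u0|u1|u1 and v0|v0|v1|v1; the NV21
+// variant below swaps which selector feeds U and which feeds V.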
+
+void NV21ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
+void NV12ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [lmove1]"f"(0x18),
+ [one]"f"(0x1), [rmove1]"f"(0x8)
+ : "memory"
+ );
+}
+
+void NV21ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
+}
+
+void NV12ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
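+ // RGB565 pack: shift each channel down, mask to 5/6/5 bits, and merge; the
+ // psubb/paddb ops build the shift amounts (5, 9, 11) from the constants.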
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7)
+ : "memory"
+ );
+}
+
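+// YUY2 is packed [Y0 U Y1 V]; one 8-byte load covers 4 pixels.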
+void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
+ "psrlh %[temp], %[y], %[eight] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
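+ // mask1 >> 8 keeps the even (Y) bytes; Y is then replicated into both bytes
+ // of each halfword (y*0x0101) before the pmulhuh multiply.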
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[y], %[y], %[temp] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
+}
+
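+// UYVY is packed [U Y0 V Y1]; the even bytes carry U/V and the odd bytes Y.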
+void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[temp], %[y], %[temp] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
+ "psrlh %[y], %[y], %[eight] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
+}
+
+void I422ToRGBARow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
+ "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
+ "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
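+// Broadcast one 32-bit ARGB value; each iteration stores 4 pixels (16 bytes).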
+void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
+ __asm__ volatile (
+ "punpcklwd %[v32], %[v32], %[v32] \n\t"
+ "1: \n\t"
+ "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
+
+ "daddi %[width], %[width], -0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [v32]"+&f"(v32)
+ : [dst_ptr]"r"(dst_argb), [width]"r"(width)
+ : "memory"
+ );
+}
+// clang-format on
+
+// 10 bit YUV to ARGB
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_msa.cc b/chromium/third_party/libyuv/source/row_msa.cc
index effa68c8b4a..fe6df93a601 100644
--- a/chromium/third_party/libyuv/source/row_msa.cc
+++ b/chromium/third_party/libyuv/source/row_msa.cc
@@ -155,22 +155,21 @@ extern "C" {
}
// Loads current and next row of ARGB input and averages them to calculate U and V
-#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
{ \
v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v16u8 vec8_m, vec9_m; \
v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
v8u16 reg8_m, reg9_m; \
\
- src0_m = (v16u8)__msa_ld_b((void*)s, 0); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 16); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 32); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 48); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 0); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 16); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 32); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 48); \
+ src0_m = (v16u8)__msa_ld_b((void*)s, 0); \
+ src1_m = (v16u8)__msa_ld_b((void*)s, 16); \
+ src2_m = (v16u8)__msa_ld_b((void*)s, 32); \
+ src3_m = (v16u8)__msa_ld_b((void*)s, 48); \
+ src4_m = (v16u8)__msa_ld_b((void*)t, 0); \
+ src5_m = (v16u8)__msa_ld_b((void*)t, 16); \
+ src6_m = (v16u8)__msa_ld_b((void*)t, 32); \
+ src7_m = (v16u8)__msa_ld_b((void*)t, 48); \
vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
@@ -195,81 +194,81 @@ extern "C" {
reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
- src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
- vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
- vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
- vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
- vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
- vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
- vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
- vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
- vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
- reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
- reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
- reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
- reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
- reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
- reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
- reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
- reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
- reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ reg8_m += const_0x0101; \
+ reg9_m += const_0x0101; \
+ reg0_m += const_0x0101; \
+ reg1_m += const_0x0101; \
+ argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \
+ argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \
+ argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \
+ argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \
}
-// Takes ARGB input and calculates U and V.
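+// Computes U and V from widened ARGB sums; 'shift' selects the final scaling.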
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
- shf0, shf1, shf2, shf3, v_out, u_out) \
+ shf0, shf1, shf2, shf3, shift, u_out, v_out) \
{ \
- v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
\
- vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
- vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
- vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
- vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
- vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
- vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
- vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
- vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
- reg0_m = __msa_dotp_u_h(vec0_m, const1); \
- reg1_m = __msa_dotp_u_h(vec1_m, const1); \
- reg2_m = __msa_dotp_u_h(vec4_m, const1); \
- reg3_m = __msa_dotp_u_h(vec5_m, const1); \
- reg0_m += const3; \
- reg1_m += const3; \
- reg2_m += const3; \
- reg3_m += const3; \
- reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
- reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
- reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
- reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
- v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
- u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+ vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const0); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const0); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const0); \
+ reg0_m += const1; \
+ reg1_m += const1; \
+ reg2_m += const1; \
+ reg3_m += const1; \
+ reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \
+ reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \
+ reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \
+ reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \
+ reg0_m = __msa_srl_w(reg0_m, shift); \
+ reg1_m = __msa_srl_w(reg1_m, shift); \
+ reg2_m = __msa_srl_w(reg2_m, shift); \
+ reg3_m = __msa_srl_w(reg3_m, shift); \
+ u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const1); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const1); \
+ reg0_m += (v4u32)const3; \
+ reg1_m += (v4u32)const3; \
+ reg2_m += (v4u32)const3; \
+ reg3_m += (v4u32)const3; \
+ reg0_m -= __msa_dotp_u_w(vec2_m, const0); \
+ reg1_m -= __msa_dotp_u_w(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_w(vec6_m, const2); \
+ reg3_m -= __msa_dotp_u_w(vec7_m, const2); \
+ u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \
+ v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \
}
// Load I444 pixel data
@@ -302,6 +301,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
}
}
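+// Mirror a row of interleaved UV pairs; each 16-bit pair keeps its U,V order.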
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ v8u16 src, dst;
+ v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0};
+ src_uv += (width - 8) << 1;
+ for (x = 0; x < width; x += 8) {
+ src = LD_UH(src_uv);
+ dst = __msa_vshf_h(shuffler, src, src);
+ ST_UH(dst, dst_uv);
+ src_uv -= 16;
+ dst_uv += 16;
+ }
+}
+
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -825,12 +838,13 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v16u8 dst0, dst1;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
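+ // The coefficients above are halved and (sum + 1) >> 1 replaces sum >> 2,
+ // keeping the old scale while rounding the 2x2 average; names are historical.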
for (x = 0; x < width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
@@ -889,12 +903,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 += __msa_hadd_u_h(vec5, vec5);
reg4 += __msa_hadd_u_h(vec0, vec0);
reg5 += __msa_hadd_u_h(vec1, vec1);
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
- reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
- reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
- reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg4 += const_0x0001;
+ reg5 += const_0x0001;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1);
reg6 = reg0 * const_0x70;
reg7 = reg1 * const_0x70;
reg8 = reg2 * const_0x4A;
@@ -1412,17 +1432,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0, dst1;
v8u16 reg0;
- v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
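+ // Luma weights move from 7-bit precision (15,75,38, >>7) to 8-bit precision
+ // (29,150,77, >>8): dotp pairs (B,G) with 0x961D, then adds R*0x4D.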
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
- reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
- reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
- reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ reg0 = __msa_dotp_u_h(vec0, const_0x961D);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8);
vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
@@ -2031,12 +2051,13 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
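+ // Same rounded-halving scheme as ARGBToUVRow_MSA above.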
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2085,10 +2106,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h((v8i16)reg0, 2);
- reg1 = __msa_srai_h((v8i16)reg1, 2);
- reg2 = __msa_srai_h((v8i16)reg2, 2);
- reg3 = __msa_srai_h((v8i16)reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h((v8i16)reg0, 1);
+ reg1 = __msa_srai_h((v8i16)reg1, 1);
+ reg2 = __msa_srai_h((v8i16)reg2, 1);
+ reg3 = __msa_srai_h((v8i16)reg3, 1);
vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
@@ -2136,12 +2161,13 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2190,10 +2216,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h(reg0, 2);
- reg1 = __msa_srai_h(reg1, 2);
- reg2 = __msa_srai_h(reg2, 2);
- reg3 = __msa_srai_h(reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h(reg0, 1);
+ reg1 = __msa_srai_h(reg1, 1);
+ reg2 = __msa_srai_h(reg2, 1);
+ reg3 = __msa_srai_h(reg3, 1);
vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
@@ -2419,16 +2449,16 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
- v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
- v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
+ v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D);
+ v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
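+ // Same 8-bit luma weights as ARGBGrayRow_MSA; the 0x80 bias rounds the >>8.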
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
- ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
dst0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
@@ -2504,61 +2534,123 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 vec0, vec1, vec2, vec3;
- v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
- v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
- v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 dst0, dst1, dst2, dst3;
+ v16u8 zero = {0};
+ v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f);
+ v4u32 const_0x00008080 = (v4u32)__msa_fill_w(0x00008080);
+ v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a);
+ v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a);
+ v4i32 shift = __msa_fill_w(0x00000008);
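+ // Row pairs are summed in 16 bits (bytes zero-extended via ilvr/ilvl), then
+ // horizontally averaged with rounding (aver_u_h) before the dot products.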
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((void*)s, 0);
- src1 = (v16u8)__msa_ld_b((void*)s, 16);
- src2 = (v16u8)__msa_ld_b((void*)s, 32);
- src3 = (v16u8)__msa_ld_b((void*)s, 48);
- src4 = (v16u8)__msa_ld_b((void*)t, 0);
- src5 = (v16u8)__msa_ld_b((void*)t, 16);
- src6 = (v16u8)__msa_ld_b((void*)t, 32);
- src7 = (v16u8)__msa_ld_b((void*)t, 48);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec0 = __msa_aver_u_b(src4, src6);
- vec1 = __msa_aver_u_b(src5, src7);
- src0 = (v16u8)__msa_ld_b((void*)s, 64);
- src1 = (v16u8)__msa_ld_b((void*)s, 80);
- src2 = (v16u8)__msa_ld_b((void*)s, 96);
- src3 = (v16u8)__msa_ld_b((void*)s, 112);
- src4 = (v16u8)__msa_ld_b((void*)t, 64);
- src5 = (v16u8)__msa_ld_b((void*)t, 80);
- src6 = (v16u8)__msa_ld_b((void*)t, 96);
- src7 = (v16u8)__msa_ld_b((void*)t, 112);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec2 = __msa_aver_u_b(src4, src6);
- vec3 = __msa_aver_u_b(src5, src7);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
+ src1 = __msa_ld_b((void*)s, 0);
+ src3 = __msa_ld_b((void*)s, 16);
+ src5 = __msa_ld_b((void*)t, 0);
+ src7 = __msa_ld_b((void*)t, 16);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 32);
+ src3 = __msa_ld_b((void*)s, 48);
+ src5 = __msa_ld_b((void*)t, 32);
+ src7 = __msa_ld_b((void*)t, 48);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst0, dst1);
+
+ src1 = __msa_ld_b((void*)s, 64);
+ src3 = __msa_ld_b((void*)s, 80);
+ src5 = __msa_ld_b((void*)t, 64);
+ src7 = __msa_ld_b((void*)t, 80);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 96);
+ src3 = __msa_ld_b((void*)s, 112);
+ src5 = __msa_ld_b((void*)t, 96);
+ src7 = __msa_ld_b((void*)t, 112);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst2, dst3);
+
+ dst0 = (v8u16)__msa_pckev_b(dst2, dst0);
+ dst1 = (v8u16)__msa_pckev_b(dst3, dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
s += 128;
t += 128;
dst_v += 16;
@@ -2574,28 +2666,30 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
- s += 128;
- t += 128;
- dst_v += 16;
- dst_u += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
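+ // 16 pixels yield 8 U and 8 V bytes, so only the low doubleword is stored.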
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2607,29 +2701,30 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
- v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, src0, src1, src2, src3);
- ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2641,28 +2736,30 @@ void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13};
+ v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2734,13 +2831,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
}
}
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+// TODO: respect YuvConstants
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
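+ // YuvConstants has a different field layout on Arm builds, so the Y
+ // coefficient and bias are read from the matching members.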
v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1;
v4i32 reg0, reg1, reg2, reg3;
- v4i32 vec_yg = __msa_fill_w(0x4A35);
- v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v4i32 vec_yg = __msa_fill_w(yg);
+ v8i16 vec_ygb = __msa_fill_h(ygb);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 max = __msa_ldi_h(0xFF);
v8i16 zero = {0};
@@ -3006,7 +3114,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
int x;
- v16u8 src0, src1, src2, src3, dst0, dst1;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
@@ -3051,12 +3159,12 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
- vec0 += vec8;
- vec1 += vec9;
- vec2 += vec10;
- vec3 += vec11;
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8);
+ dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+ dst0 = (v16u8)__msa_adds_u_b(dst0, dst2);
+ dst1 = (v16u8)__msa_adds_u_b(dst1, dst3);
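+ // Pack to 8 bits first, then add with unsigned saturation so per-channel
+ // sums clamp at 255 instead of wrapping.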
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
@@ -3082,7 +3190,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
v16i8 zero = {0};
- for (x = 0; x < width; x += 8) {
+ for (x = 0; x < width; x += 16) {
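+ // Four 16-byte loads = 16 ARGB pixels per iteration, matching the loop step.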
src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
@@ -3315,10 +3423,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
}
}
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc
index ff87e74c62c..a5aeaabfbd7 100644
--- a/chromium/third_party/libyuv/source/row_neon.cc
+++ b/chromium/third_party/libyuv/source/row_neon.cc
@@ -114,11 +114,11 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV444 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -140,11 +140,11 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -168,10 +168,10 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %5, %5, #8 \n"
- "vld1.8 {d23}, [%3]! \n"
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
+ "subs %5, %5, #8 \n"
+ "vld1.8 {d23}, [%3]! \n"
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -195,10 +195,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -221,9 +221,9 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -253,9 +253,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %4, %4, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -287,10 +287,10 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB1555
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -321,14 +321,14 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // vbic bits to clear
+ "vmov.u8 d4, #0x0f \n" // vbic bits to clear
"1: \n"
READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB4444
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -342,35 +342,38 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
"q12", "q13", "q14", "q15");
}
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV400 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
- [kUVToG] "r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
}
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n"
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -384,11 +387,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -407,11 +410,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -436,9 +439,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -463,9 +466,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -486,9 +489,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %3, %3, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -506,11 +509,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -527,11 +530,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -550,18 +553,18 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store U
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
@@ -571,18 +574,18 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load U
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
@@ -593,13 +596,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
- "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store R
- "vst1.8 {q1}, [%2]! \n" // store G
- "vst1.8 {q2}, [%3]! \n" // store B
- "bgt 1b \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -607,7 +610,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "d0", "d1", "d2" // Clobber List
- );
+ );
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@@ -618,13 +621,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load R
- "vld1.8 {q1}, [%1]! \n" // load G
- "vld1.8 {q2}, [%2]! \n" // load B
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
- "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -632,33 +635,33 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
+ );
}
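
SplitRGBRow_NEON and MergeRGBRow_NEON above are the 3-plane analogue of the UV pair: vld3/vst3 de-interleave and re-interleave 3-byte pixels, 16 per iteration. A scalar sketch of the split direction (name ours); the merge is the same loop with sources and destinations exchanged:

// Scalar sketch of SplitRGBRow: packed RGB triplets to three planes.
static void SplitRGBRow_Scalar(const uint8_t* src_rgb, uint8_t* dst_r,
                               uint8_t* dst_g, uint8_t* dst_b, int width) {
  for (int i = 0; i < width; ++i) {
    dst_r[i] = src_rgb[3 * i + 0];
    dst_g[i] = src_rgb[3 * i + 1];
    dst_b[i] = src_rgb[3 * i + 2];
  }
}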
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -668,11 +671,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
@@ -682,41 +685,62 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #32 \n" // 32 bytes per loop
+
+ "1: \n"
+ "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
+ "subs %2, #32 \n" // 32 pixels per loop.
+ "vrev64.8 q0, q2 \n"
+ "vrev64.8 q1, q1 \n"
+ "vswp d0, d1 \n"
+ "vswp d2, d3 \n"
+ "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
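
The rewritten MirrorRow_NEON doubles the per-iteration work from 16 to 32 bytes: it walks the source backwards with a -32 post-index, byte-reverses each d lane with vrev64.8, completes the 16-byte reversal of each q register with vswp, and stores the pair forward in swapped order. A scalar sketch of the contract it preserves (name ours):

// Scalar sketch of MirrorRow: dst gets the bytes of src in reverse order.
static void MirrorRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}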
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
:
- : "cc", "memory", "r3", "q0");
+ : "cc", "memory", "r12", "q0");
}
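
Note the rename that happens across this hunk and the next: the old MirrorUVRow_NEON, which mirrored and split interleaved UV into separate U and V planes, becomes MirrorSplitUVRow_NEON below, and the MirrorUVRow_NEON added here mirrors the interleaved plane directly, reversing the order of the pairs while keeping each U,V pair intact (vld2/vst2 hold the channels in separate lanes while vrev64.8 reverses them). A scalar sketch of the new contract, with width counting UV pairs (name ours):

// Scalar sketch of the new MirrorUVRow: reverse pair order, keep U before V.
static void MirrorUVRow_Scalar(const uint8_t* src_uv, uint8_t* dst_uv,
                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_uv[2 * (width - 1 - i) + 0];  // U
    dst_uv[2 * i + 1] = src_uv[2 * (width - 1 - i) + 1];  // V
  }
}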
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
asm volatile(
// Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
"1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -725,77 +749,113 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
: "cc", "memory", "r12", "q0");
}
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #32 \n"
"1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
+ "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vrev64.8 d3, d3 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
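
ARGBMirrorRow_NEON takes the same approach: instead of vrev64.32 on four whole pixels, it de-interleaves eight pixels with vld4.8 so a per-channel vrev64.8 reverses pixel order, then vst4.8 re-interleaves. The contract, as a scalar sketch (name ours):

#include <string.h>

// Scalar sketch of ARGBMirrorRow: reverse the order of 4-byte pixels;
// bytes inside each pixel keep their channel positions.
static void ARGBMirrorRow_Scalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    memcpy(dst_argb + 4 * i, src_argb + 4 * (width - 1 - i), 4);
  }
}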
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ src_rgb24 += width * 3 - 24;
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"(-24) // %3
+ : "cc", "memory", "d0", "d1", "d2");
}
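
RGB24MirrorRow_NEON is new and follows the same pattern with vld3/vst3 for 3-byte pixels; the adjustment of the source pointer to the last group of 8 pixels is done in C (src_rgb24 += width * 3 - 24) before entering the loop. Scalar sketch (name ours):

// Scalar sketch of RGB24MirrorRow: reverse the order of 3-byte pixels.
static void RGB24MirrorRow_Scalar(const uint8_t* src_rgb24,
                                  uint8_t* dst_rgb24, int width) {
  for (int i = 0; i < width; ++i) {
    const uint8_t* p = src_rgb24 + 3 * (width - 1 - i);
    dst_rgb24[3 * i + 0] = p[0];
    dst_rgb24[3 * i + 1] = p[1];
    dst_rgb24[3 * i + 2] = p[2];
  }
}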
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "vmov.u8 d0, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
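
RAWToRGBARow_NEON is also new in this roll. It is RAWToARGBRow_NEON with the constant alpha moved to d0, so the vst4 writes A,B,G,R byte order, libyuv's RGBA layout. A scalar sketch, assuming libyuv's RAW byte order of R,G,B in memory (name ours):

// Scalar sketch of RAWToRGBARow: RAW bytes R,G,B -> RGBA bytes A,B,G,R,
// with alpha forced to 255 as by "vmov.u8 d0, #255" above.
static void RAWToRGBARow_Scalar(const uint8_t* src_raw, uint8_t* dst_rgba,
                                int width) {
  for (int i = 0; i < width; ++i) {
    dst_rgba[4 * i + 0] = 255;                 // A
    dst_rgba[4 * i + 1] = src_raw[3 * i + 2];  // B
    dst_rgba[4 * i + 2] = src_raw[3 * i + 1];  // G
    dst_rgba[4 * i + 3] = src_raw[3 * i + 0];  // R
  }
}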
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3" // Clobber List
- );
+ );
}
#define RGB565TOARGB \
@@ -814,19 +874,19 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
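
The RGB565TOARGB macro used above (its body falls outside this hunk) expands the 5/6/5-bit channels to 8 bits by replicating the top bits into the low bits, so both extremes map exactly. A scalar sketch of that unpack, with libyuv's RGB565 bit layout of B in bits 0-4, G in 5-10, R in 11-15:

#include <stdint.h>

// Scalar sketch of the 565 unpack: replicate top bits, so 0x1f -> 0xff.
static void RGB565Unpack(uint16_t px, uint8_t* b, uint8_t* g, uint8_t* r) {
  unsigned b5 = px & 0x1f;
  unsigned g6 = (px >> 5) & 0x3f;
  unsigned r5 = (px >> 11) & 0x1f;
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}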
#define ARGB1555TOARGB \
@@ -860,19 +920,19 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
#define ARGB4444TOARGB \
@@ -889,19 +949,19 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
+ );
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@@ -909,63 +969,63 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@@ -974,18 +1034,18 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
+ );
}
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@@ -994,18 +1054,18 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
+ );
}
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@@ -1014,16 +1074,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
@@ -1032,7 +1092,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7" // Clobber List
- );
+ );
}
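
YUY2ToUVRow_NEON reads two rows of YUY2 (bytes Y0,U,Y1,V per pixel pair), and vrhadd.u8 takes the rounded vertical average of the chroma before it is stored; UYVYToUVRow_NEON below is the same with the byte offsets shifted down by one. Scalar sketch (name ours; width in pixels, assumed even):

// Scalar sketch of YUY2ToUVRow: average U and V with the next row.
static void YUY2ToUVRow_Scalar(const uint8_t* src_yuy2, int stride_yuy2,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_yuy2 + stride_yuy2;
  for (int i = 0; i < width; i += 2) {  // one U,V per two pixels
    dst_u[i / 2] = (uint8_t)((src_yuy2[2 * i + 1] + next[2 * i + 1] + 1) >> 1);
    dst_v[i / 2] = (uint8_t)((src_yuy2[2 * i + 3] + next[2 * i + 3] + 1) >> 1);
  }
}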
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@@ -1041,16 +1101,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
@@ -1059,7 +1119,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7" // Clobber List
- );
+ );
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@@ -1068,20 +1128,20 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
+ );
}
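
ARGBShuffleRow_NEON implements the byte-order conversions named in the comment above with a single 16-byte table: q2 holds the shuffler, and the vtbl pair rewrites each group of four pixels as out[j] = in[shuffler[j]]. Scalar sketch (name ours; width assumed a multiple of 4, as the NEON loop effectively requires of its callers):

// Scalar sketch of ARGBShuffleRow; e.g. the table
// {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} converts ARGB <-> BGRA.
static void ARGBShuffleRow_Scalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; i += 4) {  // 4 pixels = 16 bytes per group
    for (int j = 0; j < 16; ++j) {
      dst_argb[4 * i + j] = src_argb[4 * i + shuffler[j]];
    }
  }
}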
void I422ToYUY2Row_NEON(const uint8_t* src_y,
@@ -1091,12 +1151,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1113,12 +1173,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1133,11 +1193,11 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1150,16 +1210,16 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "vdup.32 d2, %2 \n" // dither4
+ "vdup.32 d2, %2 \n" // dither4
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n" // add for dither
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n" // add for dither
ARGBTORGB565
- "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1172,11 +1232,11 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1188,14 +1248,14 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "vmov.u8 d4, #0x0f \n" // bits to clear with
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1205,20 +1265,20 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1231,33 +1291,54 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
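
The hunks around here carry the substantive math change of this roll. ARGBToYRow_NEON (above) moves from 7-bit luma coefficients 13/65/33 with vqrshrun.s16 #7 to 8-bit 25/129/66 with vqrshrn.u16 #8; the carried-over comments still quote the old fractions, while the new constants work out to 25/256 ≈ 0.098, 129/256 ≈ 0.504 and 66/256 ≈ 0.258. ARGBToYJRow_NEON just below gets the same treatment (15/75/38 become 29/150/77), RGBAToYJRow_NEON is added reading one d register over so the leading alpha byte is skipped, and the RGB565/ARGB1555/ARGB4444 Y kernels near the end of this section repeat the 25/129/66 change. The *ToUVRow_NEON hunks further down are re-indentation plus, where it applied, one comment fix ("32 processed per loop" was really 16), except ARGB4444ToUVRow_NEON, which drops its hand-inlined multiply tail in favor of the shared RGBTOUV macro. Scalar checks of the three formulas (helper names ours; the U/V form is inferred from the halved constants the UV kernels load):

#include <stdint.h>

// Studio-range Y (ARGBToYRow): coefficients sum to 220, so white maps to
// ((220 * 255 + 128) >> 8) + 16 = 219 + 16 = 235, the BT.601 maximum.
static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  int y = ((25 * b + 129 * g + 66 * r + 128) >> 8) + 16;
  return (uint8_t)(y > 255 ? 255 : y);  // vqadd.u8 saturates the same way
}

// Full-range Y (ARGBToYJRow/RGBAToYJRow): 29 + 150 + 77 = 256, so white
// stays 255 and no offset is added.
static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}

// U/V per 2x2-averaged sample; 0x8080 supplies the +128 bias plus the
// rounding half consumed by the >> 8, mirroring the vadd/vqshrn pair.
static void RGBToUV(uint8_t r, uint8_t g, uint8_t b, uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}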
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d1, d24 \n" // B
+ "vmlal.u8 q2, d2, d25 \n" // G
+ "vmlal.u8 q2, d3, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1271,32 +1352,32 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
// coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
-
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1328,34 +1409,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1374,34 +1455,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1419,34 +1500,34 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q3, q2, q1)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2
@@ -1464,34 +1545,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
@@ -1509,34 +1590,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
@@ -1554,34 +1635,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2
@@ -1599,34 +1680,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1
"+r"(dst_u), // %2
@@ -1645,55 +1726,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.

- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_stride_rgb565), // %1
"+r"(dst_u), // %2
@@ -1711,55 +1792,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.

- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_stride_argb1555), // %1
"+r"(dst_u), // %2
@@ -1777,55 +1858,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.

- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q0, q4, #1 \n" // 2x average
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_stride_argb4444), // %1
"+r"(dst_u), // %2
@@ -1838,21 +1910,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1864,21 +1936,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1890,21 +1962,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1914,20 +1986,20 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1937,20 +2009,20 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1960,20 +2032,20 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1983,20 +2055,20 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2006,20 +2078,20 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2027,6 +2099,48 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
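
The ...ToYRow hunks above all make the same change: the halved BT.601 luma coefficients (13/65/33, narrowed with a #7 shift) become the full 8-bit set (25/129/66, narrowed with a rounding #8 shift), and vqrshrun.s16 becomes vqrshrn.u16 since the u8 x u8 accumulator is unsigned. In scalar form the updated loops compute the following (illustrative helper, not patch code):

    #include <stdint.h>

    /* Studio-swing luma: round, narrow, then add the +16 offset
       with saturation (the vqadd.u8 d0, d27 step). */
    static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
      int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
      return (uint8_t)(y > 255 ? 255 : y);
    }
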
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
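
RGB24ToYJRow_NEON and RAWToYJRow_NEON are new in this roll. The J variants compute full-range (JPEG) luma, so the coefficients sum to 256 and no +16 offset is added. A scalar sketch with an illustrative name:

    #include <stdint.h>

    static uint8_t rgb_to_yj(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
    }
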
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2035,46 +2149,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2092,51 +2206,51 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %3, #8 \n"
- "blt 89f \n"
+ "subs %3, #8 \n"
+ "blt 89f \n"
// Blend 8 pixels.
"8: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
"89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
// Blend 1 pixels.
"1: \n"
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2153,16 +2267,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2178,32 +2292,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
"vqdmulh.s16 q0, q0, q8 \n" // b * scale
"vqdmulh.s16 q1, q1, q8 \n" // g
"vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2220,28 +2334,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
"vqrdmulh.s16 q11, q11, d0[1] \n" // g
"vqrdmulh.s16 q12, q12, d0[2] \n" // r
"vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2251,23 +2365,23 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2281,32 +2395,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
@@ -2322,51 +2436,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
-
- "1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2383,19 +2497,19 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2412,13 +2526,13 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2435,13 +2549,13 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2460,17 +2574,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2487,12 +2601,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2511,15 +2625,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2539,23 +2653,23 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%5 \n" // top
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2564,7 +2678,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
: "r"(2), // %5
"r"(6) // %6
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// SobelY as a matrix is
@@ -2577,23 +2691,23 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%4 \n" // left
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%0],%5 \n" // right
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2601,7 +2715,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
: "r"(1), // %4
"r"(6) // %5
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
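
Both Sobel kernels evaluate the same 1-2-1 tap and take an absolute difference. For SobelX the weighting runs down three rows and the subtracted samples sit two columns over (hence the %5 = 2 / %6 = 6 post-increments re-walking each row); for SobelY it runs along the row and the difference is taken between the two rows. A scalar sketch (illustrative name):

    #include <stdint.h>
    #include <stdlib.h>

    static uint8_t sobel_tap(int a, int b, int c, int a2, int b2, int c2) {
      int s = abs((a + 2 * b + c) - (a2 + 2 * b2 + c2));
      return (uint8_t)(s > 255 ? 255 : s);
    }
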
// %y passes a float as a scalar vector for vector * scalar multiply.
@@ -2615,18 +2729,18 @@ void HalfFloat1Row_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2641,18 +2755,18 @@ void HalfFloatRow_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2667,17 +2781,17 @@ void ByteToFloatRow_NEON(const uint8_t* src,
asm volatile(
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 bytes
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u8 q1, d2 \n" // 8 shorts
- "vmovl.u16 q2, d2 \n" // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // scale
- "vmul.f32 q3, q3, %y3 \n"
- "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2685,6 +2799,238 @@ void ByteToFloatRow_NEON(const uint8_t* src,
: "cc", "memory", "q1", "q2", "q3");
}
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// filter 5 adjacent source samples with 1, 4, 6, 4, 1 coefficients to produce 1 sample.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
+ "vld1.32 {q2}, [%0] \n"
+ "vadd.u32 q0, q0, q1 \n" // * 1
+ "vadd.u32 q1, q1, q2 \n" // * 1
+ "vld1.32 {q2, q3}, [%2]! \n"
+ "vmla.u32 q0, q2, q11 \n" // * 6
+ "vmla.u32 q1, q3, q11 \n" // * 6
+ "vld1.32 {q2, q3}, [%1]! \n"
+ "vld1.32 {q8, q9}, [%3]! \n"
+ "vadd.u32 q2, q2, q8 \n" // add rows for * 4
+ "vadd.u32 q3, q3, q9 \n"
+ "vmla.u32 q0, q2, q10 \n" // * 4
+ "vmla.u32 q1, q3, q10 \n" // * 4
+ "subs %5, %5, #8 \n" // 8 processed per loop
+ "vqshrn.u32 d0, q0, #8 \n" // round and pack
+ "vqshrn.u32 d1, q1, #8 \n"
+ "vst1.u16 {q0}, [%4]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
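
GaussCol_NEON and GaussRow_NEON are the two passes of a separable 5-tap binomial (Gaussian) filter. The column pass widens to 32 bits; the row pass narrows back with a saturating >> 8, so the two passes together divide by 16 * 16 = 256. One tap in scalar form (illustrative name):

    #include <stdint.h>

    static uint32_t gauss5(uint32_t s0, uint32_t s1, uint32_t s2,
                           uint32_t s3, uint32_t s4) {
      return s0 + 4 * s1 + 6 * s2 + 4 * s3 + s4;  /* weights sum to 16 */
    }
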
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d0, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d1, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
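
AYUVToUVRow_NEON and AYUVToVURow_NEON share the same 2x2 subsample and differ only in which averaged plane is written to d0 and d1 (UV versus VU output order). The vpaddl/vpadal pair sums four samples and vqrshrun #2 rounds; per output byte that is simply (illustrative name):

    #include <stdint.h>

    static uint8_t avg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
      return (uint8_t)((a + b + c + d + 2) >> 2);
    }
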
+// Copy row of AYUV Y's into Y.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n"
+ "vqrshrn.u16 d1, q1, #2 \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
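
HalfMergeUVRow_NEON, also new in this roll, 2x2 box-filters separate U and V planes and writes them interleaved (NV12 UV layout). Per output byte and plane it is the same rounded four-sample average shown in the avg4 sketch above, applied to two adjacent bytes on two adjacent rows of each source plane.
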
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libyuv/source/row_neon64.cc
index 24b4520babc..d5258a3aef3 100644
--- a/chromium/third_party/libyuv/source/row_neon64.cc
+++ b/chromium/third_party/libyuv/source/row_neon64.cc
@@ -68,13 +68,13 @@ extern "C" {
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
-#define YUVTORGB_SETUP \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+#define YUVTORGB_SETUP \
+ "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+ "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+
+// clang-format off
#define YUVTORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
@@ -89,29 +89,23 @@ extern "C" {
"mov v2.d[0], v1.d[1] \n" /* Extract V */ \
"uxtl v2.8h, v2.8b \n" \
"uxtl v1.8h, v1.8b \n" /* Extract U */ \
- "mul v3.8h, v1.8h, v27.8h \n" \
- "mul v5.8h, v1.8h, v29.8h \n" \
- "mul v6.8h, v2.8h, v30.8h \n" \
- "mul v7.8h, v2.8h, v28.8h \n" \
+ "mul v3.8h, v27.8h, v1.8h \n" \
+ "mul v5.8h, v29.8h, v1.8h \n" \
+ "mul v6.8h, v30.8h, v2.8h \n" \
+ "mul v7.8h, v28.8h, v2.8h \n" \
"sqadd v6.8h, v6.8h, v5.8h \n" \
- "sqadd " #vB \
- ".8h, v24.8h, v0.8h \n" /* B */ \
- "sqadd " #vG \
- ".8h, v25.8h, v0.8h \n" /* G */ \
- "sqadd " #vR \
- ".8h, v26.8h, v0.8h \n" /* R */ \
- "sqadd " #vB ".8h, " #vB \
- ".8h, v3.8h \n" /* B */ \
- "sqsub " #vG ".8h, " #vG \
- ".8h, v6.8h \n" /* G */ \
- "sqadd " #vR ".8h, " #vR \
- ".8h, v7.8h \n" /* R */ \
- "sqshrun " #vB ".8b, " #vB \
- ".8h, #6 \n" /* B */ \
- "sqshrun " #vG ".8b, " #vG \
- ".8h, #6 \n" /* G */ \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
+// clang-format on
+
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -120,13 +114,16 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
READYUV444
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -149,13 +146,17 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
+ "movi v23.8b, #255 \n" /* A */
+
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -179,13 +180,17 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v23.8b}, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -209,13 +214,16 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
- "1: \n"
+ "movi v20.8b, #255 \n" /* A */
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v23, v22, v21)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -238,12 +246,15 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -265,6 +276,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
+// clang-format off
+
void I422ToRGB565Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -272,13 +285,17 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTORGB565
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -308,14 +325,18 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTOARGB1555
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -328,6 +349,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
}
+// clang-format on
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
@@ -347,15 +369,18 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "movi v23.8b, #255 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB4444
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -370,23 +395,27 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
);
}
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUV400
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
@@ -394,14 +423,15 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v23.8b, #255 \n"
+ "movi v23.8b, #255 \n"
"1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -416,13 +446,15 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV12
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -443,13 +475,15 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV21
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -470,12 +504,14 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV12
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -496,12 +532,14 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV21
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -521,13 +559,13 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "1: \n" READNV12 YUVTORGB(
- v22, v21,
- v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP "1: \n" READNV12
+ "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
+ v22, v21, v20) ARGBTORGB565
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -546,13 +584,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUY2
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -571,13 +610,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READUYVY
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -597,18 +637,19 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store U
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
@@ -618,18 +659,20 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
@@ -640,12 +683,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store R
- "st1 {v1.16b}, [%2], #16 \n" // store G
- "st1 {v2.16b}, [%3], #16 \n" // store B
- "b.gt 1b \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -653,7 +697,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@@ -664,12 +708,16 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
- "ld1 {v2.16b}, [%2], #16 \n" // load B
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -677,33 +725,34 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
// Copy multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "ldp q0, q1, [%0], #32 \n"
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- "stp q0, q1, [%1], #32 \n"
- "b.gt 1b \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -712,130 +761,219 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
: "cc", "memory", "v0");
}
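Both fill routines amount to a memset at different granularities: the NEON code stores 16 bytes (SetRow) or 4 ARGB pixels (ARGBSetRow) per iteration. A scalar sketch in the style of the library's C fallbacks (names illustrative):

    #include <stdint.h>

    static void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = v8;  // NEON: dup v0.16b + st1, 16 bytes at a time
      }
    }

    static void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
      uint32_t* dst = (uint32_t*)dst_argb;
      for (int i = 0; i < width; ++i) {
        dst[i] = v32;  // NEON: dup v0.4s + st1, 4 pixels at a time
      }
    }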
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirror) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
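The rewritten MirrorRow drops rev64 plus split stores in favour of a tbl shuffle through kShuffleMirror, reading the row backwards 32 bytes per iteration. What it computes, as a scalar sketch:

    #include <stdint.h>

    static void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
      src += width - 1;  // start at the last byte of the source row
      for (int i = 0; i < width; ++i) {
        dst[i] = src[-i];
      }
    }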
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1");
-}
-
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
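ARGBMirrorRow reverses whole 4-byte pixels rather than single bytes, which is why kShuffleMirrorARGB keeps each group of four indices in order. Scalar sketch:

    #include <stdint.h>

    static void ARGBMirrorRow_C(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width) {
      const uint32_t* src = (const uint32_t*)src_argb + width - 1;
      uint32_t* dst = (uint32_t*)dst_argb;
      for (int i = 0; i < width; ++i) {
        dst[i] = src[-i];  // copy ARGB pixels in reverse order
      }
    }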
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "ld1 {v3.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #48 \n"
+
+ "1: \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v2.16b}, v3.16b \n"
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-48), // %3
+ "r"(&kShuffleMirror) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v4.8b, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v5.8b, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
+ );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "movi v0.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "orr v1.8b, v5.8b, v5.8b \n" // move r
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
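RAW stores R,G,B in memory while RGB24 stores B,G,R, so the RAW conversions above are pure byte reorders (plus an alpha fill for the ARGB/RGBA targets). Scalar sketch of the RGB24 case:

    #include <stdint.h>

    static void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24,
                                int width) {
      for (int i = 0; i < width; ++i) {
        dst_rgb24[0] = src_raw[2];  // B
        dst_rgb24[1] = src_raw[1];  // G
        dst_rgb24[2] = src_raw[0];  // R
        src_raw += 3;
        dst_rgb24 += 3;
      }
    }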
#define RGB565TOARGB \
@@ -855,19 +993,20 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
- );
+ );
}
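RGB565TOARGB widens the 5- and 6-bit fields back to 8 bits; the usual trick, which the macro's shift-and-or sequence amounts to, is to replicate the top bits of each field into the vacated low bits so full white stays 0xff. A per-pixel scalar sketch, assuming that bit-replication behaviour:

    #include <stdint.h>

    static void RGB565ToARGBPixel(uint16_t rgb565, uint8_t argb[4]) {
      uint8_t r5 = (uint8_t)((rgb565 >> 11) & 0x1f);
      uint8_t g6 = (uint8_t)((rgb565 >> 5) & 0x3f);
      uint8_t b5 = (uint8_t)(rgb565 & 0x1f);
      argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));  // B
      argb[1] = (uint8_t)((g6 << 2) | (g6 >> 4));  // G
      argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));  // R
      argb[3] = 0xff;                              // A
    }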
#define ARGB1555TOARGB \
@@ -911,22 +1050,24 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
+// clobbers v3
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
@@ -944,18 +1085,18 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@@ -963,64 +1104,68 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+ // RGB24
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
+ );
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@@ -1029,18 +1174,19 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@@ -1049,18 +1195,19 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
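The packed-4:2:2 routines above differ only in which interleaved lanes carry Y and chroma: YUY2 is Y0,U,Y1,V per two pixels and UYVY is U,Y0,V,Y1. Scalar sketch of the YUY2 UV extraction:

    #include <stdint.h>

    static void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u,
                                 uint8_t* dst_v, int width) {
      for (int i = 0; i < width; i += 2) {  // two pixels share one U and V
        *dst_u++ = src_yuy2[1];
        *dst_v++ = src_yuy2[3];
        src_yuy2 += 4;
      }
    }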
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@@ -1071,14 +1218,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
@@ -1087,7 +1235,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
- );
+ );
}
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@@ -1098,14 +1246,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
@@ -1114,7 +1263,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
- );
+ );
}
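In the two-row variants the urhadd instructions average chroma from vertically adjacent lines with rounding; the scalar equivalent per lane is:

    #include <stdint.h>

    static inline uint8_t RoundHalvingAdd(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);  // what urhadd does per byte lane
    }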
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@@ -1123,19 +1272,20 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
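As a usage note, the 16-byte shuffler selects input byte shuffler[i] for output byte i, so a table that exchanges bytes 0 and 2 of every 4-byte pixel converts ABGR to ARGB. The values below match libyuv's own kShuffleMaskABGRToARGB table:

    #include <stdint.h>

    static const uint8_t kSwapRB[16] = {2,  1, 0, 3,  6,  5,  4,  7,
                                        10, 9, 8, 11, 14, 13, 12, 15};
    // ARGBShuffleRow_NEON(src_abgr, dst_argb, kSwapRB, width);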
void I422ToYUY2Row_NEON(const uint8_t* src_y,
@@ -1145,13 +1295,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "orr v2.8b, v1.8b, v1.8b \n"
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1168,13 +1319,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "orr v3.8b, v2.8b, v2.8b \n"
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1189,11 +1341,13 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1206,15 +1360,17 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "dup v1.4s, %w2 \n" // dither4
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "dup v1.4s, %w2 \n" // dither4
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1227,12 +1383,13 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB1555.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1244,15 +1401,16 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "movi v4.16b, #0x0f \n" // bits to clear with
+ "movi v4.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB4444.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1262,20 +1420,21 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1288,33 +1447,56 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #15 \n" // B * 0.11400 coefficient
- "movi v5.8b, #75 \n" // G * 0.58700 coefficient
- "movi v6.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
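The luma coefficient changes above (13/65/33 with a 7-bit shift becoming 25/129/66 with an 8-bit shift for ARGBToYRow, and 15/75/38 becoming 29/150/77 for the J variants) keep the same BT.601 weights at one extra bit of precision, with uqrshrn supplying the rounding. The net per-pixel arithmetic, as a scalar sketch:

    #include <stdint.h>

    // BT.601 limited-range luma: Y = ((66R + 129G + 25B + 128) >> 8) + 16.
    static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);
    }

    // Full-range "J" luma used by the YJ variants: no +16 offset.
    static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
    }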
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v1.8b, v4.8b \n" // B
+ "umlal v0.8h, v2.8b, v5.8b \n" // G
+ "umlal v0.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1328,33 +1510,33 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movi v24.8b, #112 \n" // UB / VR 0.875
+ "movi v24.8b, #112 \n" // UB / VR 0.875
// coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- // pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
-
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
-
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1398,26 +1580,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
-
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1429,7 +1613,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
);
}
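RGBTOUV runs on 16-bit values kept at twice the pixel scale (the urshr #1 above halves a 2x2 sum, leaving 2x the average), with coefficients pre-halved to compensate. The net BT.601 chroma math per averaged pixel, as a scalar sketch:

    #include <stdint.h>

    // 0x8080 biases the signed result into the unsigned 8-bit range.
    static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }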
-// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@@ -1437,31 +1620,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
- "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
- "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
- "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
- "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
- "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1481,25 +1666,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v3.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
@@ -1519,25 +1706,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v3.8h, #1 \n" // 2x average
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
@@ -1557,25 +1746,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
@@ -1595,25 +1786,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
@@ -1633,25 +1826,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
const uint8_t* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v2.8h, v2.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
@@ -1663,7 +1858,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -1671,67 +1866,54 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int width) {
const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile(
- "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
- // 2
- "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
- "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
- "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
- "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
- "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
+ RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v17.D[0] \n"
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v16.8h, v4.8h, v22.8h \n" // B
- "mls v16.8h, v5.8h, v23.8h \n" // G
- "mls v16.8h, v6.8h, v24.8h \n" // R
- "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
- "mul v17.8h, v6.8h, v22.8h \n" // R
- "mls v17.8h, v5.8h, v26.8h \n" // G
- "mls v17.8h, v4.8h, v25.8h \n" // B
- "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27");
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
}
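The subsampling pattern in this rewrite is the same one the ARGB variants use: uaddlp sums horizontal pairs, uadalp accumulates the second row, and urshr #1 halves with rounding, leaving a value at twice the 2x2 average for RGBTOUV's halved coefficients. Scalar sketch:

    #include <stdint.h>

    // Sum a 2x2 block and halve with rounding; result is 2x the average.
    static inline uint16_t Sum2x2Half(uint8_t tl, uint8_t tr,
                                      uint8_t bl, uint8_t br) {
      return (uint16_t)((tl + tr + bl + br + 1) >> 1);
    }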
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
@@ -1744,50 +1926,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
@@ -1807,52 +1982,45 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
int width) {
const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile(
- RGBTOUV_SETUP_REG
+ RGBTOUV_SETUP_REG // sets v20-v25
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
@@ -1863,26 +2031,27 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28"
- );
+ );
}
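// The RGB565/ARGB1555/ARGB4444 ToY kernels below move from 13/65/33 with
// a #7 shift to 25/129/66 with a #8 shift; both encode BT.601
// studio-range luma. A scalar sketch of what the updated code computes
// per pixel (saturation omitted for brevity):
static inline uint8_t RGBToYSketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)(((25 * b + 129 * g + 66 * r + 128) >> 8) + 16);
}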
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1895,21 +2064,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1921,21 +2091,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1945,20 +2116,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1968,20 +2140,21 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1991,20 +2164,21 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2014,20 +2188,21 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2037,20 +2212,21 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2058,6 +2234,50 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // B
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "movi v6.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v4.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // B
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
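// The new *ToYJRow kernels above use full-range (JPEG) BT.601 luma: the
// same 29/150/77 weights as ARGBGrayRow, rounded by uqrshrn #8, with no
// +16 offset. Scalar sketch:
static inline uint8_t RGBToYJSketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}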
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2068,44 +2288,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2124,56 +2349,60 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
- // pixels
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- // pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- // pixels
- "b.ge 8b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
"89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
"99: \n"
@@ -2193,17 +2422,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2219,32 +2448,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2261,28 +2491,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2292,23 +2523,24 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v24.8b, #15 \n" // B * 0.11400 coefficient
- "movi v25.8b, #75 \n" // G * 0.58700 coefficient
- "movi v26.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2323,32 +2555,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
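// A scalar sketch of the sepia transform above; the three coefficient
// triples are Q7 fixed point and feed the B, G and R outputs in turn.
// uqshrn #7 saturates to 255; that clamp is omitted here for brevity:
static inline void SepiaSketch(uint8_t b, uint8_t g, uint8_t r,
                               uint8_t* sb, uint8_t* sg, uint8_t* sr) {
  *sb = (uint8_t)((17 * b + 68 * g + 35 * r) >> 7);
  *sg = (uint8_t)((22 * b + 88 * g + 45 * r) >> 7);
  *sr = (uint8_t)((24 * b + 98 * g + 50 * r) >> 7);
}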
@@ -2364,51 +2597,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
-
- "1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2426,19 +2660,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2455,15 +2691,17 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2480,15 +2718,17 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2507,17 +2747,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2534,12 +2776,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2558,15 +2802,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2586,23 +2832,26 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%5 \n" // top
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2611,7 +2860,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
: "r"(2LL), // %5
"r"(6LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
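// A scalar sketch of SobelXRow above: src_y0/src_y1/src_y2 are the rows
// above, at and below the output, sampled at columns i and i + 2 with
// the middle row counted twice (the 1-2-1 weights); uqxtn saturates:
static inline uint8_t SobelXSketch(const uint8_t* y0, const uint8_t* y1,
                                   const uint8_t* y2, int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;
  return (uint8_t)(s > 255 ? 255 : s);
}
// SobelYRow below differences two rows at columns i, i + 1 (doubled) and
// i + 2 in the same way.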
// SobelY as a matrix is
@@ -2624,23 +2873,25 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%4 \n" // left
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n" // right
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2648,7 +2899,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
: "r"(1LL), // %4
"r"(6LL) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
// Caveat - rounds float to half float whereas scaling version truncates.
@@ -2658,16 +2909,17 @@ void HalfFloat1Row_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2681,18 +2933,19 @@ void HalfFloatRow_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2706,17 +2959,18 @@ void ByteToFloatRow_NEON(const uint8_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v1.8h, v1.8b \n" // 8 shorts
- "uxtl v2.4s, v1.4h \n" // 8 ints
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
- "b.gt 1b \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2730,20 +2984,21 @@ float ScaleMaxSamples_NEON(const float* src,
int width) {
float fmax;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n"
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
- "fmax v5.4s, v5.4s, v1.4s \n" // max
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "fmax v5.4s, v5.4s, v6.4s \n" // max
- "fmaxv %s3, v5.4s \n" // signed max acculator
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2759,21 +3014,22 @@ float ScaleSumSamples_NEON(const float* src,
int width) {
float fsum;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n" // max
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n"
- "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "faddp v5.4s, v5.4s, v6.4s \n"
- "faddp v5.4s, v5.4s, v5.4s \n"
- "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2786,12 +3042,13 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2808,26 +3065,31 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "movi v6.8h, #4 \n" // constant 4
- "movi v7.8h, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
- "ld1 {v2.8h}, [%4], #16 \n"
- "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
- "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
- "ld1 {v2.8h}, [%1], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "ld1 {v2.8h}, [%2], #16 \n"
- "umlal v0.4s, v2.4h, v7.4h \n" // * 6
- "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
- "ld1 {v2.8h}, [%3], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2845,27 +3107,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "movi v6.4s, #4 \n" // constant 4
- "movi v7.4s, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
- "add v0.4s, v0.4s, v1.4s \n" // * 1
- "add v1.4s, v1.4s, v2.4s \n" // * 1
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
- "mla v0.4s, v2.4s, v7.4s \n" // * 6
- "mla v1.4s, v3.4s, v7.4s \n" // * 6
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
- "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
- "add v3.4s, v3.4s, v5.4s \n"
- "mla v0.4s, v2.4s, v6.4s \n" // * 4
- "mla v1.4s, v3.4s, v6.4s \n" // * 4
- "subs %w5, %w5, #8 \n" // 8 processed per loop
- "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
- "uqrshrn2 v0.8h, v1.4s, #8 \n"
- "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
- "b.gt 1b \n"
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2876,6 +3139,246 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
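The Gauss kernels in this file all apply the binomial 1, 4, 6, 4, 1 filter, whose taps sum to 16 (256 for the separable column-then-row pair). As a reading aid, a scalar sketch of what GaussCol_NEON computes per output sample; the function name is illustrative, not part of the patch:

#include <stdint.h>

// Vertical 5-tap pass: weighted sum of 5 source rows, no normalization
// (the matching row pass divides by 256).
static void GaussColRef(const uint16_t* s0, const uint16_t* s1,
                        const uint16_t* s2, const uint16_t* s3,
                        const uint16_t* s4, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint32_t)s0[i] + 4u * s1[i] + 6u * s2[i] + 4u * s3[i] +
             (uint32_t)s4[i];
  }
}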
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ asm volatile(
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
+ "fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ : "r"(&kGaussCoefficients) // %7
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// filter one row horizontally with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+ asm volatile(
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
+ // rows
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
+ "fadd v1.4s, v1.4s, v2.4s \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
+ "fadd v2.4s, v2.4s, v4.4s \n"
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kGaussCoefficients), // %3
+ "r"(8LL), // %4
+ "r"(-4LL), // %5
+ "r"(20LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
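The post-increments bound as %4 (8), %5 (-4) and %6 (20) walk the source pointer back and forth in byte units so the overlapping taps can be reloaded without extra registers; the net advance per iteration is 8 - 4 + 8 + 20 = 32 bytes, i.e. 8 floats for 8 outputs. A scalar sketch of the computation (hypothetical name; reads width + 4 samples):

// Horizontal 5-tap pass with the 1/256 normalization folded in.
static void GaussRowF32Ref(const float* src, float* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (src[i] + 4.0f * src[i + 1] + 6.0f * src[i + 2] +
              4.0f * src[i + 3] + src[i + 4]) * (1.0f / 256.0f);
  }
}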
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
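Each ld2 pulls 8 VU pairs for 16 Y samples, and zip1 duplicates every chroma byte so one pair serves two pixels; st3 then writes bytes in V, U, Y order. A scalar equivalent (name illustrative, not part of the patch):

static void NV21ToYUV24Ref(const uint8_t* src_y, const uint8_t* src_vu,
                           uint8_t* dst_yuv24, int width) {
  for (int i = 0; i < width; ++i) {
    dst_yuv24[3 * i + 0] = src_vu[(i & ~1) + 0];  // V, shared by 2 pixels
    dst_yuv24[3 * i + 1] = src_vu[(i & ~1) + 1];  // U
    dst_yuv24[3 * i + 2] = src_y[i];              // Y
  }
}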
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
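AYUVToUVRow_NEON and AYUVToVURow_NEON differ only in which register pair is stored first; both box-average a 2x2 block of chroma. A scalar sketch of the VU variant, assuming libyuv's AYUV byte order (V, U, Y, A in memory) and an even width; the helper name is hypothetical:

static void AYUVToVURef(const uint8_t* src_ayuv, int stride,
                        uint8_t* dst_vu, int width) {
  for (int i = 0; i < width; i += 2) {
    const uint8_t* p = src_ayuv + i * 4;  // two pixels in this row
    const uint8_t* q = p + stride;        // two pixels in the next row
    dst_vu[0] = (uint8_t)((p[0] + p[4] + q[0] + q[4] + 2) >> 2);  // V
    dst_vu[1] = (uint8_t)((p[1] + p[5] + q[1] + q[5] + 2) >> 2);  // U
    dst_vu += 2;
  }
}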
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
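The tbl lookup through kShuffleSwapUV just swaps adjacent bytes; the scalar equivalent is a two-byte swap per pixel (name illustrative):

static void SwapUVRef(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int i = 0; i < width; ++i) {
    dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V written first
    dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // then U
  }
}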
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n"
+ "ld1 {v3.16b}, [%3], #16 \n"
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n"
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
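HalfMergeUVRow_NEON combines full-resolution split U and V planes into interleaved UV at half resolution, box-averaging each 2x2 block. A scalar sketch (hypothetical name, even width assumed as in the 16-pixel loop):

static void HalfMergeUVRef(const uint8_t* src_u, int stride_u,
                           const uint8_t* src_v, int stride_v,
                           uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; i += 2) {
    dst_uv[0] = (uint8_t)((src_u[i] + src_u[i + 1] + src_u[stride_u + i] +
                           src_u[stride_u + i + 1] + 2) >> 2);
    dst_uv[1] = (uint8_t)((src_v[i] + src_v[i + 1] + src_v[stride_v + i] +
                           src_v[stride_v + i + 1] + 2) >> 2);
    dst_uv += 2;
  }
}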
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/row_win.cc b/chromium/third_party/libyuv/source/row_win.cc
index 5500d7f5a64..9afcf060a4d 100644
--- a/chromium/third_party/libyuv/source/row_win.cc
+++ b/chromium/third_party/libyuv/source/row_win.cc
@@ -1594,9 +1594,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
+ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
convertloop:
@@ -2898,10 +2898,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3(
}
#endif // HAS_I422TOARGBROW_SSSE3
+// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2949,6 +2951,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -3045,15 +3048,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
@@ -3078,7 +3081,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
ret
}
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
@@ -4222,7 +4225,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
add ecx, 4 - 1
jl convertloop1b
- // 1 pixel loop.
+ // 1 pixel loop.
convertloop1:
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
@@ -5360,7 +5363,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5448,9 +5451,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
@@ -5534,7 +5537,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc
index 2cfa1c6cb1c..cf3c0332573 100644
--- a/chromium/third_party/libyuv/source/scale.cc
+++ b/chromium/third_party/libyuv/source/scale.cc
@@ -17,6 +17,7 @@
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h" // For UVScale
#ifdef __cplusplus
namespace libyuv {
@@ -103,6 +104,21 @@ static void ScalePlaneDown2(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI
+ : ScaleRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_MMI
+ : ScaleRowDown2Box_MMI);
+ }
+ }
+#endif
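These MMI hunks repeat the dispatch idiom used throughout scale.cc: select the _Any_ variant first, which processes the aligned bulk in SIMD and leaves the ragged tail to C, then upgrade to the full SIMD row when the width is suitably aligned. A condensed sketch of the pattern; HAS_SCALEROW_SIMD, kCpuHasSIMD and the ScaleRow_* names here are placeholders, not real symbols:

void (*ScaleRow)(const uint8_t*, ptrdiff_t, uint8_t*, int) = ScaleRow_C;
#if defined(HAS_SCALEROW_SIMD)
if (TestCpuFlag(kCpuHasSIMD)) {
  ScaleRow = ScaleRow_Any_SIMD;  // any width; C covers the remainder
  if (IS_ALIGNED(dst_width, 8)) {
    ScaleRow = ScaleRow_SIMD;    // whole row stays in SIMD
  }
}
#endif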
#if defined(HAS_SCALEROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown2 =
@@ -169,6 +185,14 @@ static void ScalePlaneDown2_16(int src_width,
: ScaleRowDown2Box_16_SSE2);
}
#endif
+#if defined(HAS_SCALEROWDOWN2_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_16_MMI
+ : ScaleRowDown2Box_16_MMI);
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -232,6 +256,15 @@ static void ScalePlaneDown4(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN4_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN4_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown4 =
@@ -284,6 +317,11 @@ static void ScalePlaneDown4_16(int src_width,
filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
}
#endif
+#if defined(HAS_SCALEROWDOWN4_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI;
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -341,6 +379,18 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_MMI;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_MMI;
+ if (dst_width % 24 == 0) {
+ ScaleRowDown34_0 = ScaleRowDown34_MMI;
+ ScaleRowDown34_1 = ScaleRowDown34_MMI;
+ }
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN34_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
if (!filtering) {
@@ -841,6 +891,14 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
+#if defined(HAS_SCALEADDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleAddRow = ScaleAddRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 8)) {
+ ScaleAddRow = ScaleAddRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleAddRow = ScaleAddRow_Any_MSA;
@@ -904,6 +962,11 @@ static void ScalePlaneBox_16(int src_width,
}
#endif
+#if defined(HAS_SCALEADDROW_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
+ ScaleAddRow = ScaleAddRow_16_MMI;
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
@@ -980,6 +1043,14 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -1207,6 +1278,11 @@ void ScalePlaneBilinearUp(int src_width,
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -1334,6 +1410,11 @@ void ScalePlaneBilinearUp_16(int src_width,
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -1419,6 +1500,11 @@ static void ScalePlaneSimple(int src_width,
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_MMI;
+ }
+#endif
}
for (i = 0; i < dst_height; ++i) {
@@ -1455,6 +1541,11 @@ static void ScalePlaneSimple_16(int src_width,
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_16_MMI;
+ }
+#endif
}
for (i = 0; i < dst_height; ++i) {
@@ -1580,7 +1671,7 @@ void ScalePlane_16(const uint16_t* src,
}
if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, 0, 0, dy, 1, filtering);
return;
@@ -1710,6 +1801,109 @@ int I420Scale_16(const uint16_t* src_y,
return 0;
}
+// Scale an I444 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+ dst_width, dst_height, filtering);
+ return 0;
+}
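Since I444 chroma is full resolution, all three planes share the same geometry. A hypothetical call halving a 640x480 I444 image, assuming contiguous planes (stride == width); buffers and error handling elided:

I444Scale(src_y, 640, src_u, 640, src_v, 640, 640, 480,
          dst_y, 320, dst_u, 320, dst_v, 320, 320, 240, kFilterBox);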
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+ dst_width, dst_height, filtering);
+ return 0;
+}
+
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_uv || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+ dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+ return 0;
+}
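SUBSAMPLE(v, 1, 1) here yields the half dimension rounded up ((v + 1) >> 1 for non-negative v), so odd sizes keep a full chroma sample. A hypothetical call shrinking a 1280x720 NV12 frame to 640x360; allocation and the return-code check are elided:

int r = NV12Scale(src_y, 1280, src_uv, 1280, 1280, 720,
                  dst_y, 640, dst_uv, 640, 640, 360, kFilterBilinear);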
+
// Deprecated api
LIBYUV_API
int Scale(const uint8_t* src_y,
diff --git a/chromium/third_party/libyuv/source/scale_any.cc b/chromium/third_party/libyuv/source/scale_any.cc
index 53ad1364049..c93d70c5fc7 100644
--- a/chromium/third_party/libyuv/source/scale_any.cc
+++ b/chromium/third_party/libyuv/source/scale_any.cc
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <string.h> // For memset/memcpy
+
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
@@ -18,46 +20,6 @@ namespace libyuv {
extern "C" {
#endif
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
- int dx) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
- }
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEFILTERCOLS_MSA
-CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MSA
-CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON,
- ScaleARGBFilterCols_NEON,
- ScaleARGBFilterCols_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_MSA
-CANY(ScaleARGBFilterCols_Any_MSA,
- ScaleARGBFilterCols_MSA,
- ScaleARGBFilterCols_C,
- 4,
- 7)
-#endif
-#undef CANY
-
// Fixed scale down.
// Mask may be non-power of 2, so use MOD
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
@@ -108,6 +70,22 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+ ScaleUVRowDown2Box_SSSE3,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 4)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+ ScaleUVRowDown2Box_AVX2,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 8)
+#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
@@ -150,6 +128,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+ ScaleUVRowDown2Box_NEON,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 8)
+#endif
+
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
@@ -165,6 +152,27 @@ SDANY(ScaleRowDown2Box_Any_MSA,
1,
31)
#endif
+#ifdef HAS_SCALEROWDOWN2_MMI
+SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
+SDANY(ScaleRowDown2Linear_Any_MMI,
+ ScaleRowDown2Linear_MMI,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 7)
+SDANY(ScaleRowDown2Box_Any_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 7)
+SDODD(ScaleRowDown2Box_Odd_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 7)
+#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3,
@@ -201,6 +209,15 @@ SDANY(ScaleRowDown4Box_Any_MSA,
1,
15)
#endif
+#ifdef HAS_SCALEROWDOWN4_MMI
+SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_MMI,
+ ScaleRowDown4Box_MMI,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3,
ScaleRowDown34_SSSE3,
@@ -261,6 +278,14 @@ SDANY(ScaleRowDown34_1_Box_Any_MSA,
1,
47)
#endif
+#ifdef HAS_SCALEROWDOWN34_MMI
+SDANY(ScaleRowDown34_Any_MMI,
+ ScaleRowDown34_MMI,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3,
ScaleRowDown38_SSSE3,
@@ -382,6 +407,26 @@ SDANY(ScaleARGBRowDown2Box_Any_MSA,
4,
3)
#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MMI
+SDANY(ScaleARGBRowDown2_Any_MMI,
+ ScaleARGBRowDown2_MMI,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Linear_Any_MMI,
+ ScaleARGBRowDown2Linear_MMI,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Box_Any_MMI,
+ ScaleARGBRowDown2Box_MMI,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 1)
+#endif
#undef SDANY
// Scale down by even scale factor.
@@ -433,6 +478,64 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
4,
3)
#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
+SDAANY(ScaleARGBRowDownEven_Any_MMI,
+ ScaleARGBRowDownEven_MMI,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 1)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
+ ScaleARGBRowDownEvenBox_MMI,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 1)
+#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
+#endif
+
+#ifdef SASIMDONLY
+// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
+
+// Add rows box filter scale down. Using macro from row_any
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint16_t dst_temp[32]); \
+ SIMD_ALIGNED(uint8_t src_temp[32]); \
+ memset(dst_temp, 0, 32 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(src_temp, dst_temp, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
+#endif
+#undef SAROW
+
+#else
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
@@ -456,8 +559,56 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#ifdef HAS_SCALEADDROW_MSA
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
+#endif
#undef SAANY
+#endif // SASIMDONLY
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MMI
+CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#undef CANY
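As a reading aid for the CANY adapter above: MASK is the SIMD granularity minus one, and the C tail resumes at the advanced fixed-point position, so the sampling phase stays continuous across the seam. For example, with MASK = 7 and dst_width = 30:

// n = 30 & ~7 = 24 columns go through TERP_SIMD;
// r = 30 & 7  = 6 columns go through TERP_C, writing at
// dst_ptr + 24 * BPP with x advanced to x + 24 * dx.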
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libyuv/source/scale_argb.cc
index 53a22e8b41e..451d4ec4d1b 100644
--- a/chromium/third_party/libyuv/source/scale_argb.cc
+++ b/chromium/third_party/libyuv/source/scale_argb.cc
@@ -95,6 +95,22 @@ static void ScaleARGBDown2(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
+ : ScaleARGBRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
+ : ScaleARGBRowDown2Box_MMI);
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDown2 =
@@ -227,6 +243,16 @@ static void ScaleARGBDownEven(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
+ : ScaleARGBRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
@@ -410,6 +436,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -456,6 +490,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
@@ -471,6 +513,11 @@ static void ScaleARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -572,6 +619,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -658,6 +713,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
@@ -673,6 +736,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
const int max_y = (src_height - 1) << 16;
@@ -789,6 +857,14 @@ static void ScaleARGBSimple(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBCols = ScaleARGBCols_Any_MSA;
@@ -804,6 +880,11 @@ static void ScaleARGBSimple(int src_width,
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
for (j = 0; j < dst_height; ++j) {
@@ -900,7 +981,7 @@ static void ScaleARGB(const uint8_t* src,
}
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc
index b28d7da41fc..81959925c8a 100644
--- a/chromium/third_party/libyuv/source/scale_common.cc
+++ b/chromium/third_party/libyuv/source/scale_common.cc
@@ -542,7 +542,9 @@ void ScaleFilterCols64_C(uint8_t* dst_ptr,
// Same as 8 bit arm blender but return is cast to uint16_t
#define BLENDER(a, b, f) \
- (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+ (uint16_t)( \
+ (int)(a) + \
+ (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
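The widening to int64_t matters because both f and the 16-bit sample delta can occupy 16 bits: with f = 0xffff and (b) - (a) = 65535 the product is 4294836225, which overflows 32-bit int (undefined behavior); the 64-bit product is exact before the >> 16. A standalone sketch of the fixed arithmetic, with a hypothetical helper name:

#include <stdint.h>

static uint16_t Blend16(uint16_t a, uint16_t b, uint32_t f) {
  // Same arithmetic as the corrected BLENDER: widen before multiplying.
  return (uint16_t)(
      (int)a + (int)((((int64_t)f * ((int64_t)b - (int)a)) + 0x8000) >> 16));
}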
void ScaleFilterCols_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
@@ -774,6 +776,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr,
}
}
+// ARGB scale row functions
+
void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -1016,6 +1020,235 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
#undef BLENDERC
#undef BLENDER
+// UV scale row functions.
+// Same as ARGB, but with 2 channels.
+
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ (void)src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += src_stepx * 2;
+ dst_uv += 2;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+ (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
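For readers tracing the fixed-point math: x is 16.16 fixed point and xf keeps only the top 7 fractional bits, so the blend runs at 1/128 granularity. A worked example, purely illustrative:

// x = 0x18000 is source position 1.5:
//   xi = x >> 16         = 1
//   xf = (x >> 9) & 0x7f = 0x40 (64)
// BLENDER1 then evaluates (a * (0x7f ^ 64) + b * 64) >> 7, i.e.
// (a * 63 + b * 64) >> 7 -- roughly the midpoint, with the 127-vs-128
// weighting that the TODO above proposes to fix.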
+
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -1065,6 +1298,14 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width_bytes, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -1171,8 +1412,8 @@ enum FilterMode ScaleFilterReduce(int src_width,
src_height = -src_height;
}
if (filtering == kFilterBox) {
- // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
- if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ // If scaling either axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
}
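The operator change means box filtering now survives reduction only when both axes shrink below half size; scaling either axis by a factor of 0.5 or more falls back to bilinear. An illustrative call against the function's own signature:

// 1000x1000 -> 600x400: the width only shrinks to 0.6x.
enum FilterMode f = ScaleFilterReduce(1000, 1000, 600, 400, kFilterBox);
// Old '&&' test: stays kFilterBox (400 * 2 < 1000).
// New '||' test: becomes kFilterBilinear (600 * 2 >= 1000).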
diff --git a/chromium/third_party/libyuv/source/scale_gcc.cc b/chromium/third_party/libyuv/source/scale_gcc.cc
index 312236d2df8..e575ee18bcb 100644
--- a/chromium/third_party/libyuv/source/scale_gcc.cc
+++ b/chromium/third_party/libyuv/source/scale_gcc.cc
@@ -102,16 +102,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -125,25 +125,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -156,33 +156,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -200,17 +200,17 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -225,26 +225,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -258,34 +258,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -301,24 +301,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -332,46 +332,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea 0x00(%4,%4,2),%3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%4,2),%%xmm2 \n"
- "movdqu 0x10(%0,%4,2),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -387,26 +387,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -420,46 +420,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -476,32 +476,32 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -514,58 +514,58 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
- );
+ );
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -580,62 +580,62 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
- );
+ );
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -651,23 +651,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -681,39 +681,39 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -726,57 +726,57 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqu 0x00(%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -791,25 +791,25 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n" // src_ptr += 16
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -824,22 +824,22 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm3 \n"
- "lea 0x20(%0),%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw (%1),%%ymm2,%%ymm0 \n"
- "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -866,69 +866,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile(
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
-
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movzwl 0x00(%1,%4,1),%k2 \n"
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
// 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2,(%0) \n"
- "lea 0x2(%0),%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2,(%0) \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -966,16 +966,16 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -993,14 +993,14 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1017,17 +1017,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1043,21 +1043,21 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1076,23 +1076,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
"1: \n"
- "movd (%0),%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%0,%1,2),%%xmm2 \n"
- "movd 0x00(%0,%4,1),%%xmm3 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1113,32 +1113,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
- "lea 0x00(%0,%5,1),%5 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps 0x00(%0,%1,1),%%xmm0 \n"
- "movq 0x00(%0,%1,2),%%xmm1 \n"
- "movhps 0x00(%0,%4,1),%%xmm1 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps 0x00(%5,%1,1),%%xmm2 \n"
- "movq 0x00(%5,%1,2),%%xmm3 \n"
- "movhps 0x00(%5,%4,1),%%xmm3 \n"
- "lea 0x00(%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1156,56 +1156,56 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
LABELALIGN
"40: \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%3,%0,4),%%xmm1 \n"
- "movd 0x00(%3,%1,4),%%xmm4 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
"29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
"99: \n"
: "=&a"(x0), // %0
"=&d"(x1), // %1
@@ -1230,16 +1230,16 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1267,63 +1267,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
- );
+ );
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps 0x00(%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
-
- LABELALIGN "99: \n" // clang-format error.
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+
+ LABELALIGN
+ "99: \n" // clang-format error.
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1339,10 +1340,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
@@ -1353,19 +1354,108 @@ int FixedDiv_X86(int num, int div) {
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
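+
+// The two routines above compute a 16.16 fixed-point quotient entirely in
+// x86 registers. An illustrative portable model of the same arithmetic
+// follows (the _Sketch names are hypothetical, not libyuv APIs; assumes
+// div != 0 and, for the FixedDiv1 variant, div > 1):
+static int FixedDiv_Sketch(int num, int div) {
+  // Widen to 64 bits, shift into 16.16 fixed point, then divide.
+  return (int)((((int64_t)num) << 16) / div);
+}
+
+static int FixedDiv1_Sketch(int num, int div) {
+  // Mirrors the sub $0x10001 / sbb $0x0 / sub $0x1 sequence above.
+  return (int)(((((int64_t)num) << 16) - 0x00010001) / (div - 1));
+}
+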
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+// Shuffle table for splitting UV into upper and lower part of register.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+ 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
+ 6u, 14u, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80};
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n" // vertical add
+ "psrlw $0x1,%%xmm0 \n" // round
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
+ "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
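+
+// Both UV kernels above implement a 2x2 box filter over interleaved UV
+// pixels: each output U (and V) is the rounded average of a 2x2 block.
+// Illustrative scalar model (the _Sketch name is hypothetical, not a libyuv
+// API; the psrlw $1 + pavgw pair rounds identically to (sum + 2) >> 2):
+static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* s = src_ptr;               // row 0
+  const uint8_t* t = src_ptr + src_stride;  // row 1
+  int x;
+  for (x = 0; x < dst_width; ++x) {  // dst_width counts output UV pairs
+    dst_ptr[0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
+    dst_ptr[1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
+    s += 4;
+    t += 4;
+    dst_ptr += 2;
+  }
+}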
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/scale_mmi.cc b/chromium/third_party/libyuv/source/scale_mmi.cc
new file mode 100644
index 00000000000..1226ef3eaf5
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_mmi.cc
@@ -0,0 +1,1168 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Mips MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// clang-format off
+
+// CPU agnostic row functions
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+ const uint64_t shift = 0x8ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlh %[src0], %[src0], %[shift] \n\t"
+
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlh %[src1], %[src1], %[shift] \n\t"
+
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t mask = 0x00ff00ff00ff00ffULL;
+ const uint64_t shift = 0x8ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "and %[dest0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "and %[dest1], %[src1], %[mask] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+
+ "psrlh %[src0], %[src0], %[shift] \n\t"
+ "psrlh %[src1], %[src1], %[shift] \n\t"
+ "packushb %[dest1], %[src0], %[src1] \n\t"
+
+ "pavgb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
+ [shift] "f"(shift), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+
+ uint64_t s0, s1, t0, t1;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t ph = 0x0002000200020002ULL;
+ const uint64_t mask = 0x00ff00ff00ff00ffULL;
+ const uint64_t shift0 = 0x2ULL;
+ const uint64_t shift1 = 0x8ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "psrlh %[s1], %[s0], %[shift1] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "psrlh %[t1], %[t0], %[shift1] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddh %[dest0], %[s0], %[s1] \n\t"
+ "paddh %[dest0], %[dest0], %[t0] \n\t"
+ "paddh %[dest0], %[dest0], %[t1] \n\t"
+ "paddh %[dest0], %[dest0], %[ph] \n\t"
+ "psrlh %[dest0], %[dest0], %[shift0] \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "psrlh %[s1], %[s0], %[shift1] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "psrlh %[t1], %[t0], %[shift1] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddh %[dest1], %[s0], %[s1] \n\t"
+ "paddh %[dest1], %[dest1], %[t0] \n\t"
+ "paddh %[dest1], %[dest1], %[t1] \n\t"
+ "paddh %[dest1], %[dest1], %[ph] \n\t"
+ "psrlh %[dest1], %[dest1], %[shift0] \n\t"
+
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
+ [mask] "f"(mask)
+ : "memory");
+}
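+
+// Illustrative scalar model of the planar 2x2 box filter above (the _Sketch
+// name is hypothetical, not a libyuv API): each output byte is the rounded
+// average of two horizontal neighbours from two adjacent rows, matching the
+// +2 bias and >> 2 in the MMI code.
+static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst,
+                                    int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
+    s += 2;
+    t += 2;
+  }
+}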
+
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpckhwd %[dest], %[src0], %[src1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
+ "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
+
+ "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ const uint8_t* s = src_argb;
+ const uint8_t* t = src_argb + src_stride;
+
+ uint64_t s0, s_hi, s_lo;
+ uint64_t t0, t_hi, t_lo;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t mask = 0x0ULL;
+ const uint64_t ph = 0x0002000200020002ULL;
+  const uint64_t shift = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
+ "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
+ "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
+ "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
+
+ "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
+      "psrlh      %[dest_lo], %[dest_lo], %[shift]       \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
+ "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
+ "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
+ "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
+
+ "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
+      "psrlh      %[dest_hi], %[dest_hi], %[shift]       \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
+ [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
+        [mask] "f"(mask), [ph] "f"(ph), [shift] "f"(shift)
+ : "memory");
+}
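+
+// The ARGB box filter above applies the same (sum + 2) >> 2 average
+// independently to each of the four bytes of two horizontally adjacent
+// pixels from two rows. Illustrative scalar model (hypothetical _Sketch
+// helper, not a libyuv API):
+static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
+                                        ptrdiff_t src_stride,
+                                        uint8_t* dst_argb,
+                                        int dst_width) {
+  int x, c;
+  for (x = 0; x < dst_width; ++x) {
+    for (c = 0; c < 4; ++c) {  // B, G, R, A channels
+      dst_argb[c] = (uint8_t)((src_argb[c] + src_argb[c + 4] +
+                               src_argb[c + src_stride] +
+                               src_argb[c + src_stride + 4] + 2) >>
+                              2);
+    }
+    src_argb += 8;  // two source pixels consumed
+    dst_argb += 4;  // one destination pixel produced
+  }
+}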
+
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+ const uint64_t shift = 0x10ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+
+ "packsswh %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
+
+ "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
+
+ "pavgh %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+
+ uint64_t s0, s1, s_hi, s_lo;
+ uint64_t t0, t1, t_hi, t_lo;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t ph = 0x0000000200000002ULL;
+ const uint64_t mask = 0x0000ffff0000ffffULL;
+ const uint64_t shift0 = 0x10ULL;
+ const uint64_t shift1 = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "psrlw %[s1], %[s0], %[shift0] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "psrlw %[t1], %[t0], %[shift0] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddw %[dest0], %[s0], %[s1] \n\t"
+ "paddw %[dest0], %[dest0], %[t0] \n\t"
+ "paddw %[dest0], %[dest0], %[t1] \n\t"
+ "paddw %[dest0], %[dest0], %[ph] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift1] \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "psrlw %[s1], %[s0], %[shift0] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "psrlw %[t1], %[t0], %[shift0] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddw %[dest1], %[s0], %[s1] \n\t"
+ "paddw %[dest1], %[dest1], %[t0] \n\t"
+ "paddw %[dest1], %[dest1], %[t1] \n\t"
+ "paddw %[dest1], %[dest1], %[ph] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift1] \n\t"
+
+ "packsswh %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
+ [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
+ [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
+ [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t shift = 0x10ULL;
+ const uint64_t mask = 0x000000ff000000ffULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+ "and %[src0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+ "and %[src1], %[src1], %[mask] \n\t"
+ "packsswh %[dest_lo], %[src0], %[src1] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+ "and %[src0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+ "and %[src1], %[src1], %[mask] \n\t"
+ "packsswh %[dest_hi], %[src0], %[src1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift), [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
+ "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
+ "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
+ "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+
+#define DO_SCALEROWDOWN4BOX_LOOP(reg) \
+ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
+ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
+ \
+ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
+ "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
+ "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
+ "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
+ "paddh " #reg ", " #reg ", %[ph] \n\t" \
+ "psrlh " #reg ", " #reg ", %[shift] \n\t" \
+ \
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
+ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
+
+/* LibYUVScaleTest.ScaleDownBy4_Box */
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* src0_ptr = src_ptr;
+ const uint8_t* src1_ptr = src_ptr + src_stride;
+ const uint8_t* src2_ptr = src_ptr + src_stride * 2;
+ const uint8_t* src3_ptr = src_ptr + src_stride * 3;
+
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
+
+ const uint64_t mask0 = 0x0ULL;
+ const uint64_t mask1 = 0x0001000100010001ULL;
+ const uint64_t ph = 0x0008000800080008ULL;
+ const uint64_t shift = 0x4ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
+
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
+ [ph] "f"(ph), [mask1] "f"(mask1)
+ : "memory");
+}
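+
+// The macro pipeline above sums a 4x4 block per output byte: vertical paddh
+// over four rows, then two pmaddhw-by-one passes fold the horizontal
+// columns, then (sum + 8) >> 4. Illustrative scalar model (hypothetical
+// _Sketch helper, not a libyuv API):
+static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst,
+                                    int dst_width) {
+  int x, i, j;
+  for (x = 0; x < dst_width; ++x) {
+    int sum = 0;
+    for (i = 0; i < 4; ++i) {    // four rows
+      for (j = 0; j < 4; ++j) {  // four columns
+        sum += src_ptr[i * src_stride + j];
+      }
+    }
+    dst[x] = (uint8_t)((sum + 8) >> 4);
+    src_ptr += 4;
+  }
+}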
+
+#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+
+#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
+ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
+ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
+ \
+ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
+ "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
+ "paddw %[dest], %[dest_hi], %[dest] \n\t" \
+ "paddw %[dest], %[dest], %[ph] \n\t" \
+ "psraw %[dest], %[dest], %[shift] \n\t" \
+ "and " #reg ", %[dest], %[mask1] \n\t" \
+ \
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
+ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
+
+/* LibYUVScaleTest.ScaleDownBy4_Box_16 */
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* src0_ptr = src_ptr;
+ const uint16_t* src1_ptr = src_ptr + src_stride;
+ const uint16_t* src2_ptr = src_ptr + src_stride * 2;
+ const uint16_t* src3_ptr = src_ptr + src_stride * 3;
+
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
+
+ const uint64_t mask0 = 0x0ULL;
+ const uint64_t mask1 = 0x00000000ffffffffULL;
+ const uint64_t ph = 0x0000000800000008ULL;
+ const uint64_t shift = 0x04ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
+ "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
+ "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
+ [ph] "f"(ph), [mask1] "f"(mask1)
+ : "memory");
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest;
+
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest;
+
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+
+ "punpcklhw %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "punpckhhw %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
+ : "memory");
+}
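+
+// Both ColsUp2 kernels above duplicate every source sample once; punpcklbh /
+// punpcklhw of a register with itself interleaves each element with a copy.
+// Illustrative scalar model for the 8-bit case (hypothetical _Sketch helper,
+// not a libyuv API; x and dx are unused, as above):
+static void ScaleColsUp2_Sketch(uint8_t* dst_ptr,
+                                const uint8_t* src_ptr,
+                                int dst_width,
+                                int x,
+                                int dx) {
+  int j;
+  (void)x;
+  (void)dx;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[j] = dst_ptr[j + 1] = src_ptr[j / 2];  // point sample, repeated 2x
+  }
+}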
+
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ uint64_t src, src_hi, src_lo, dest0, dest1;
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask] \n\t"
+
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "paddush %[dest0], %[dest0], %[src_lo] \n\t"
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "paddush %[dest1], %[dest1], %[src_hi] \n\t"
+
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [src] "=&f"(src)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
+ [mask] "f"(mask)
+ : "memory");
+}
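+
+// ScaleAddRow accumulates a row of 8-bit samples into a 16-bit sum buffer;
+// the paddush instructions above saturate rather than wrap. Illustrative
+// scalar model (hypothetical _Sketch helper, not a libyuv API; uses plain
+// adds, assuming the accumulator never exceeds 16 bits):
+static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               int src_width) {
+  int x;
+  for (x = 0; x < src_width; ++x) {
+    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
+  }
+}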
+
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width) {
+ uint64_t src, src_hi, src_lo, dest0, dest1;
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "punpcklhw %[src_lo], %[src], %[mask] \n\t"
+ "punpckhhw %[src_hi], %[src], %[mask] \n\t"
+
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "paddw %[dest0], %[dest0], %[src_lo] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "paddw %[dest1], %[dest1], %[src_hi] \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [src] "=&f"(src)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
+ "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
+ [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ const uint8_t* src0_ptr = src_argb;
+ const uint8_t* src1_ptr = src_argb + src_stride;
+
+ uint64_t src0, src1, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1;
+
+ const uint64_t mask = 0x0ULL;
+ const uint64_t ph = 0x0002000200020002ULL;
+ const uint64_t shift = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
+ "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
+
+ "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
+ "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+ "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
+ "paddh %[dest0], %[dest0], %[ph] \n\t"
+ "psrlh %[dest0], %[dest0], %[shift] \n\t"
+
+ "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
+ "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
+
+ "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
+ "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
+
+ "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
+ "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+ "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
+ "paddh %[dest1], %[dest1], %[ph] \n\t"
+ "psrlh %[dest1], %[dest1], %[shift] \n\t"
+
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
+ "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
+ [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
+ [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
+ [ph] "f"(ph)
+ : "memory");
+}
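
With ph = 0x0002000200020002 and shift = 2, each channel of an output pixel is the rounded average of a 2x2 block of source pixels: (a + b + c + d + 2) >> 2. A per-channel scalar sketch of that rounding (helper name illustrative):

#include <stdint.h>

// Rounded 2x2 box average for one 8-bit channel, matching the +2 bias (ph)
// and the >>2 (shift) used in the MMI loop above.
static inline uint8_t Box2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}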
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+
+ const uint32_t* src_tmp;
+
+ uint64_t dest, offset;
+
+ const uint64_t shift0 = 16;
+ const uint64_t shift1 = 2;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "srav %[offset], %[x], %[shift0] \n\t"
+ "sllv %[offset], %[offset], %[shift1] \n\t"
+ "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
+ "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
+ "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[x], %[x], %[dx] \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
+ "daddi %[width], %[width], -0x01 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
+ : "memory");
+}
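
x and dx are 16.16 fixed-point, so the srav by 16 extracts the integer pixel index and the sllv by 2 turns it into a byte offset into 4-byte ARGB pixels. The scalar loop this vectorizes is essentially libyuv's portable C fallback:

#include <stdint.h>

// Point-sample dst_width ARGB pixels at 16.16 fixed-point positions.
static void ScaleARGBCols_Ref(uint32_t* dst, const uint32_t* src,
                              int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of x picks the source pixel
    x += dx;
  }
}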
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest0, dest1;
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "punpcklwd %[dest0], %[src], %[src] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest1], %[src], %[src] \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
+ : "memory");
+}
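
Unpacking the source against itself (punpcklwd/punpckhwd with both operands equal) duplicates each 32-bit pixel, i.e. dst[2*i] = dst[2*i+1] = src[i]. A scalar sketch:

#include <stdint.h>

// 2x horizontal upscale by pixel duplication (point sampling).
static void ScaleARGBColsUp2_Ref(uint32_t* dst, const uint32_t* src,
                                 int dst_width) {
  for (int i = 0; i < dst_width / 2; ++i) {
    dst[2 * i] = dst[2 * i + 1] = src[i];
  }
}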
+
+// Divide num by div and return as 16.16 fixed point result.
+/* LibYUVBaseTest.TestFixedDiv */
+int FixedDiv_MIPS(int num, int div) {
+ int quotient = 0;
+ const int shift = 16;
+
+ asm(
+ "dsll %[num], %[num], %[shift] \n\t"
+ "ddiv %[num], %[div] \t\n"
+ "mflo %[quo] \t\n"
+ : [quo] "+&r"(quotient)
+ : [num] "r"(num), [div] "r"(div), [shift] "r"(shift));
+
+ return quotient;
+}
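
The dsll/ddiv pair is the 16.16 fixed-point divide; in C the same computation (which should match libyuv's FixedDiv_C) is:

#include <stdint.h>

// Shift the numerator into 16.16 fixed point, then divide.
static int FixedDiv_Ref(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}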
+
+// Divide num by div and return as 16.16 fixed point result.
+/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
+int FixedDiv1_MIPS(int num, int div) {
+ int quotient = 0;
+ const int shift = 16;
+ const int val1 = 1;
+ const int64_t val11 = 0x00010001ULL;
+
+ asm(
+ "dsll %[num], %[num], %[shift] \n\t"
+ "dsub %[num], %[num], %[val11] \n\t"
+ "dsub %[div], %[div], %[val1] \n\t"
+ "ddiv %[num], %[div] \t\n"
+ "mflo %[quo] \t\n"
+ : [quo] "+&r"(quotient)
+ : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11),
+ [shift] "r"(shift));
+
+ return quotient;
+}
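
The two dsub instructions bias the operands before the divide (numerator minus 0x00010001, divisor minus 1), the form used when computing column interpolation steps. In C this should be equivalent to:

#include <stdint.h>

// Biased 16.16 fixed-point divide (numerator - 0x00010001, divisor - 1).
static int FixedDiv1_Ref(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}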
+
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads an extra pixel per row, so 9x2.
+void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* src2_ptr = src_ptr + src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest04, dest15, dest26, dest37;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ const uint64_t mask0 = 0x0003000900030009ULL;
+ const uint64_t mask1 = 0x0001000300010003ULL;
+ const uint64_t mask2 = 0x0009000300090003ULL;
+ const uint64_t mask3 = 0x0003000100030001ULL;
+ const uint64_t ph = 0x0000000800000008ULL;
+ const uint64_t shift = 4;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
+ "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
+ "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
+ "pmaddhw %[dest], %[src1], %[mask1] \n\t"
+ "paddw %[dest04], %[dest04], %[dest] \n\t"
+ "paddw %[dest04], %[dest04], %[ph] \n\t"
+ "psrlw %[dest04], %[dest04], %[shift] \n\t"
+
+ "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
+ "pmaddhw %[dest], %[src1], %[mask3] \n\t"
+ "paddw %[dest15], %[dest15], %[dest] \n\t"
+ "paddw %[dest15], %[dest15], %[ph] \n\t"
+ "psrlw %[dest15], %[dest15], %[shift] \n\t"
+
+ "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
+ "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
+ "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
+ "pmaddhw %[dest], %[src1], %[mask1] \n\t"
+ "paddw %[dest26], %[dest26], %[dest] \n\t"
+ "paddw %[dest26], %[dest26], %[ph] \n\t"
+ "psrlw %[dest26], %[dest26], %[shift] \n\t"
+
+ "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
+ "pmaddhw %[dest], %[src1], %[mask3] \n\t"
+ "paddw %[dest37], %[dest37], %[dest] \n\t"
+ "paddw %[dest37], %[dest37], %[ph] \n\t"
+ "psrlw %[dest37], %[dest37], %[shift] \n\t"
+
+ /* tmp0 = ( 00 04 02 06 ) */
+ "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
+ /* tmp1 = ( 01 05 03 07 ) */
+ "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
+
+ /* tmp2 = ( 00 01 04 05 )*/
+ "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
+ /* tmp3 = ( 02 03 06 07 )*/
+ "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
+
+ /* ( 00 01 02 03 ) */
+ "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ /* ( 04 05 06 07 ) */
+ "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
+ [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
+ [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
+ : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
+ : "memory");
+}
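
Reading the pmaddhw masks as weight pairs, mask0/mask1 apply (9,3) to row 0 and (3,1) to row 1, while mask2/mask3 give the mirrored (3,9)/(1,3) taps, so each output is a 2x2 bilinear blend with weights summing to 16, the +8 bias from ph, and the >>4 from shift. One even-phase output in scalar form (inferred from the constants, not copied from libyuv):

#include <stdint.h>

// Even-phase 2x upsample tap: 9:3:3:1 blend of the 2x2 neighborhood,
// rounded (+8) and normalized (>>4). The odd phase swaps to 3:9:1:3.
static inline uint16_t Up2Even(const uint16_t* s, const uint16_t* t, int i) {
  return (uint16_t)((9 * s[i] + 3 * s[i + 1] + 3 * t[i] + t[i + 1] + 8) >> 4);
}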
+
+void ScaleRowDown34_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ uint64_t src[2];
+ uint64_t tmp[2];
+ __asm__ volatile (
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "and %[tmp1], %[src0], %[mask1] \n\t"
+ "psrlw %[tmp0], %[src0], %[rmov] \n\t"
+ "psllw %[tmp0], %[tmp0], %[lmov1] \n\t"
+ "or %[src0], %[tmp0], %[tmp1] \n\t"
+ "punpckhwd %[tmp0], %[src0], %[src0] \n\t"
+ "psllw %[tmp1], %[tmp0], %[rmov] \n\t"
+ "or %[src0], %[src0], %[tmp1] \n\t"
+ "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t"
+ "pextrh %[tmp0], %[tmp0], %[zero] \n\t"
+ "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t"
+ "pextrh %[tmp0], %[src1], %[zero] \n\t"
+ "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t"
+
+ "punpckhwd %[tmp0], %[src1], %[src1] \n\t"
+ "pextrh %[tmp1], %[tmp0], %[zero] \n\t"
+ "psrlw %[src1], %[src1], %[rmov] \n\t"
+ "psllw %[tmp1], %[tmp1], %[rmov8] \n\t"
+ "or %[src1], %[src1], %[tmp1] \n\t"
+ "and %[tmp0], %[tmp0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[tmp0] \n\t"
+
+ "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]),
+ [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1])
+ : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst),
+ [lmov]"f"(0xc), [rmov]"f"(0x18),
+ [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8),
+ [zero]"f"(0x0), [mask2]"f"(0xff000000),
+ [width]"r"(dst_width), [lmov1]"f"(0x10)
+ : "memory"
+ );
+}
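
The shuffle sequence above selects the same pixels as the portable path: of every four source pixels, keep 0, 1 and 3. A scalar sketch mirroring libyuv's ScaleRowDown34_C:

#include <stdint.h>

// 3/4 horizontal downsample: keep pixels 0, 1 and 3 of each group of 4.
static void ScaleRowDown34_Ref(const uint8_t* src_ptr, uint8_t* dst,
                               int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}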
+// clang-format on
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc
index 459a2995dfe..572b4bfa9b3 100644
--- a/chromium/third_party/libyuv/source/scale_neon.cc
+++ b/chromium/third_party/libyuv/source/scale_neon.cc
@@ -31,16 +31,16 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
- );
+ );
}
// Read 32x1 average down and write 16x1.
@@ -51,17 +51,17 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
- );
+ );
}
// Read 32x2 average down and write 16x1.
@@ -71,28 +71,28 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0 \n"
+ "add %1, %0 \n"
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
// row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
@@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [%3]! \n"
- "vld1.8 {q2}, [%4]! \n"
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vld1.8 {q3}, [%3] \n"
+ "vld1.8 {q3}, [%3] \n"
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
asm volatile(
- "vld1.16 {q13}, [%5] \n"
- "vld1.8 {q14}, [%6] \n"
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ "vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -504,38 +504,26 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}
-void ScaleAddRows_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst_ptr,
- int src_width,
- int src_height) {
- const uint8_t* src_tmp;
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
asm volatile(
"1: \n"
- "mov %0, %1 \n"
- "mov r12, %5 \n"
- "veor q2, q2, q2 \n"
- "veor q3, q3, q3 \n"
- "2: \n"
- // load 16 pixels into q0
- "vld1.8 {q0}, [%0], %3 \n"
- "vaddw.u8 q3, q3, d1 \n"
- "vaddw.u8 q2, q2, d0 \n"
- "subs r12, r12, #1 \n"
- "bgt 2b \n"
- "vst1.16 {q2, q3}, [%2]! \n" // store pixels
- "add %1, %1, #16 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop
- "bgt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
:
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
- );
+ : "memory", "cc", "q0", "q1", "q2" // Clobber List
+ );
}
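
The interface change drops the two-dimensional ScaleAddRows (which looped over src_height internally, clobbering r12) in favor of a single-row ScaleAddRow; the caller now owns the row loop. A hedged sketch of how a box-filter caller would drive it (setup and names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Accumulate boxheight source rows into a uint16_t sum row using the
// one-row kernel, as the box-filter caller would after this change.
static void AddBoxRows(const uint8_t* src, ptrdiff_t src_stride,
                       int boxheight, uint16_t* row16, int src_width) {
  memset(row16, 0, src_width * sizeof(uint16_t));
  for (int y = 0; y < boxheight; ++y) {
    ScaleAddRow_NEON(src + y * src_stride, row16, src_width);
  }
}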
// TODO(Yang Zhang): Investigate fewer load instructions for
@@ -559,17 +547,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_ptr;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q3, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "vadd.s32 q2, q1, q3 \n"
- "vshl.i32 q0, q3, #1 \n" // 8 * dx
- "1: \n"
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -578,27 +566,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "vmov q10, q1 \n"
- "vmov q11, q2 \n"
- "vuzp.16 q10, q11 \n"
- "vmovl.u8 q8, d6 \n"
- "vmovl.u8 q9, d7 \n"
- "vsubl.s16 q11, d18, d16 \n"
- "vsubl.s16 q12, d19, d17 \n"
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q10, d21 \n"
- "vmul.s32 q11, q11, q13 \n"
- "vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
- "vadd.s16 q8, q8, q9 \n"
- "vmovn.s16 d6, q8 \n"
-
- "vst1.8 {d6}, [%0]! \n" // store pixels
- "vadd.s32 q1, q1, q0 \n"
- "vadd.s32 q2, q2, q0 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -621,75 +609,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "vld1.8 {q1}, [%1]! \n"
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
- "vst1.8 {d1[7]}, [%0] \n"
+ "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -706,18 +694,18 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vmov q2, q1 \n" // load next 8 ARGB
- "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // load next 8 ARGB
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
@@ -734,19 +722,19 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vrhadd.u8 q1, q2, q3 \n" // rounding half add
- "vst2.32 {q0, q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
@@ -755,27 +743,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -793,15 +781,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
- "mov r12, %3, lsl #2 \n"
+ "mov r12, %3, lsl #2 \n"
"1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -817,30 +805,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -877,8 +865,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -909,16 +897,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_argb;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q9, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- "vmov.i8 q3, #0x7f \n" // 0x7F
- "vmov.i16 q15, #0x7f \n" // 0x7F
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q8, q1, q0 \n"
- "1: \n"
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
@@ -962,6 +950,64 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef LOAD2_DATA32_LANE
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
+ "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
+ "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst2.8 {d0, d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q8", "q9");
+}
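
Per output pixel this is the same rounded 2x2 box as the ARGB variant, applied to interleaved 2-byte UV samples. A scalar sketch of one output row (helper name illustrative):

#include <stdint.h>

// Rounded 2x2 box average of interleaved UV; s and t are adjacent rows.
static void ScaleUVRowDown2Box_Ref(const uint8_t* s, const uint8_t* t,
                                   uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[2 * x + 0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
    dst[2 * x + 1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
    s += 4;
    t += 4;
  }
}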
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n"
+ "vld1.16 {d0[1]}, [%1], %6 \n"
+ "vld1.16 {d0[2]}, [%2], %6 \n"
+ "vld1.16 {d0[3]}, [%3], %6 \n"
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6
+ : "memory", "cc", "d0");
+}
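
The four staggered pointers just unroll a simple 2-byte gather: output pixel i is input pixel i * src_stepx. A scalar sketch (illustrative name):

#include <stdint.h>
#include <string.h>

// Keep every src_stepx-th UV pixel (2 bytes each), no filtering.
static void ScaleUVRowDownEven_Ref(const uint8_t* src, int src_stepx,
                                   uint8_t* dst, int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    memcpy(dst + 2 * i, src + 2 * i * src_stepx, 2);
  }
}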
+
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libyuv/source/scale_neon64.cc
index 494a9cfbfbe..185591cb55b 100644
--- a/chromium/third_party/libyuv/source/scale_neon64.cc
+++ b/chromium/third_party/libyuv/source/scale_neon64.cc
@@ -29,16 +29,17 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
- );
+ );
}
// Read 32x1 average down and write 16x1.
@@ -50,17 +51,18 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
- );
+ );
}
// Read 32x2 average down and write 16x1.
@@ -70,26 +72,28 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
@@ -99,10 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -119,19 +124,23 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- "ld1 {v1.16b}, [%2], #16 \n"
- "ld1 {v2.16b}, [%3], #16 \n"
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_ptr1), // %2
@@ -151,12 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -169,49 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
// 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -226,33 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -279,14 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- "st1 {v2.8b}, [%1], #8 \n"
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -303,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v29.8h}, [%5] \n"
- "ld1 {v30.16b}, [%6] \n"
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
// combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -372,35 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
+ "add v0.8h, v0.8h, v16.8h \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
// Align for table lookup, vtbl requires registers to be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -422,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v30.8h}, [%4] \n"
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
// combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
+ "uqrshrn v2.8b, v2.8h, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -478,33 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -515,38 +535,27 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}
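
The "multiply by 65536 / n and take the upper 16 bits" comments above compress the whole trick into one sqrdmulh per vector. An editorial scalar sketch (not part of the patch) of the same fixed-point division; since sqrdmulh is a doubling high-half multiply, the NEON constant is presumably stored pre-halved (e.g. 65536 / 12 for n = 6), whereas this form uses the plain reciprocal plus a rounding bias:

    #include <cstdint>

    // Divide a box-filter sum by a non-power-of-2 n via a 16.16 reciprocal.
    static inline uint16_t DivideByN(uint32_t sum, uint32_t n) {
      const uint32_t recip = 65536u / n;                // 16.16 reciprocal
      return (uint16_t)((sum * recip + 32768u) >> 16);  // keep upper 16 bits
    }
    // DivideByN(6 * 255, 6) == 255, DivideByN(9, 3) == 3.
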
-void ScaleAddRows_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst_ptr,
- int src_width,
- int src_height) {
- const uint8_t* src_tmp;
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
asm volatile(
"1: \n"
- "mov %0, %1 \n"
- "mov w12, %w5 \n"
- "eor v2.16b, v2.16b, v2.16b \n"
- "eor v3.16b, v3.16b, v3.16b \n"
- "2: \n"
- // load 16 pixels into q0
- "ld1 {v0.16b}, [%0], %3 \n"
- "uaddw2 v3.8h, v3.8h, v0.16b \n"
- "uaddw v2.8h, v2.8h, v0.8b \n"
- "subs w12, w12, #1 \n"
- "b.gt 2b \n"
- "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
- "add %1, %1, #16 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
:
- : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
- );
+ : "memory", "cc", "v0", "v1", "v2" // Clobber List
+ );
}
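
For reference, a minimal scalar sketch of what the rewritten ScaleAddRow_NEON computes (assumed to mirror the C fallback): widen each source byte and accumulate it into the 16-bit row used by the box filter.

    #include <cstdint>

    static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int src_width) {
      // One widening add per pixel; the NEON version does 16 at a time.
      for (int x = 0; x < src_width; ++x) {
        dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
      }
    }
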
// TODO(Yang Zhang): Investigate less load instructions for
@@ -572,17 +581,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v3.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v1.4s, v1.4s, v0.4s \n"
+ "add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "add v2.4s, v1.4s, v3.4s \n"
- "shl v0.4s, v3.4s, #1 \n" // 8 * dx
- "1: \n"
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -591,27 +600,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "mov v6.16b, v1.16b \n"
- "mov v7.16b, v2.16b \n"
- "uzp1 v6.8h, v6.8h, v7.8h \n"
- "ushll v4.8h, v4.8b, #0 \n"
- "ushll v5.8h, v5.8b, #0 \n"
- "ssubl v16.4s, v5.4h, v4.4h \n"
- "ssubl2 v17.4s, v5.8h, v4.8h \n"
- "ushll v7.4s, v6.4h, #0 \n"
- "ushll2 v6.4s, v6.8h, #0 \n"
- "mul v16.4s, v16.4s, v7.4s \n"
- "mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
- "add v4.8h, v4.8h, v6.8h \n"
- "xtn v4.8b, v4.8h \n"
-
- "st1 {v4.8b}, [%0], #8 \n" // store pixels
- "add v1.4s, v1.4s, v0.4s \n"
- "add v2.4s, v2.4s, v0.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -635,74 +644,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
-
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
- "st1 {v0.b}[15], [%0] \n"
+ "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -721,17 +739,18 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "mov v2.16b, v3.16b \n"
- "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
@@ -742,19 +761,20 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
-
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "urhadd v1.16b, v2.16b, v3.16b \n"
- "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
@@ -763,25 +783,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+      "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 16
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -800,13 +822,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "ld1 {v0.s}[0], [%0], %3 \n"
- "ld1 {v0.s}[1], [%0], %3 \n"
- "ld1 {v0.s}[2], [%0], %3 \n"
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -824,33 +847,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
- "ld1 {v1.8b}, [%1], %4 \n"
- "ld1 {v2.8b}, [%0], %4 \n"
- "ld1 {v3.8b}, [%1], %4 \n"
- "ld1 {v4.8b}, [%0], %4 \n"
- "ld1 {v5.8b}, [%1], %4 \n"
- "ld1 {v6.8b}, [%0], %4 \n"
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -887,10 +912,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
// clang-format on
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -923,16 +949,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v6.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- "movi v3.16b, #0x7f \n" // 0x7F
- "movi v4.8h, #0x7f \n" // 0x7F
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v5.4s, v1.4s, v0.4s \n"
- "1: \n"
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
@@ -953,15 +979,15 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
-
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -984,26 +1010,28 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
"1: \n"
- "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #8 \n" // 8 processed per loop
- "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
- "uaddlp v1.4s, v1.8h \n"
- "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
- "uadalp v1.4s, v3.8h \n"
- "rshrn v0.4h, v0.4s, #2 \n" // round and pack
- "rshrn2 v0.8h, v1.4s, #2 \n"
- "st1 {v0.8h}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
// Read 8x2 upsample with filtering and write 16x1.
@@ -1013,38 +1041,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
uint16_t* dst,
int dst_width) {
asm volatile(
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "movi v0.8h, #9 \n" // constants
- "movi v1.4s, #3 \n"
+      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
"1: \n"
- "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
- "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
- "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
- "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
- "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
- "umull v16.4s, v3.4h, v0.4h \n"
- "umull2 v7.4s, v3.8h, v0.8h \n"
- "umull v18.4s, v4.4h, v0.4h \n"
- "umull2 v17.4s, v4.8h, v0.8h \n"
- "uaddw v16.4s, v16.4s, v6.4h \n"
- "uaddl2 v19.4s, v6.8h, v3.8h \n"
- "uaddl v3.4s, v6.4h, v3.4h \n"
- "uaddw2 v6.4s, v7.4s, v6.8h \n"
- "uaddl2 v7.4s, v5.8h, v4.8h \n"
- "uaddl v4.4s, v5.4h, v4.4h \n"
- "uaddw v18.4s, v18.4s, v5.4h \n"
- "mla v16.4s, v4.4s, v1.4s \n"
- "mla v18.4s, v3.4s, v1.4s \n"
- "mla v6.4s, v7.4s, v1.4s \n"
- "uaddw2 v4.4s, v17.4s, v5.8h \n"
- "uqrshrn v16.4h, v16.4s, #4 \n"
- "mla v4.4s, v19.4s, v1.4s \n"
- "uqrshrn2 v16.8h, v6.4s, #4 \n"
- "uqrshrn v17.4h, v18.4s, #4 \n"
- "uqrshrn2 v17.8h, v4.4s, #4 \n"
- "st2 {v16.8h-v17.8h}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1053,7 +1083,65 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"r"(14LL) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
"v19" // Clobber List
- );
+ );
+}
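
An editorial sketch of the 9:3:3:1 weighting implied by the #9 and #3 constants above: each upsampled output leans on its nearest source pixel, with the horizontal and vertical neighbors at 3 and the diagonal at 1, and uqrshrn #4 supplying the (sum + 8) >> 4 normalization.

    #include <cstdint>

    static uint16_t Upsample2x2_Sketch(uint16_t nearest, uint16_t horiz,
                                       uint16_t vert, uint16_t diag) {
      return (uint16_t)((9 * nearest + 3 * horiz + 3 * vert + diag + 8) >> 4);
    }
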
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
+ "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v16", "v17");
+}
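
ScaleUVRowDown2Box_NEON averages 2x2 blocks of interleaved U/V bytes: uaddlp/uadalp build the four-sample sums per channel and rshrn #2 is the rounded divide by 4. Scalar sketch, assumed to match the C fallback:

    #include <cstddef>
    #include <cstdint>

    static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_uv,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_uv, int dst_width) {
      const uint8_t* row1 = src_uv + src_stride;
      for (int x = 0; x < dst_width; ++x) {
        // Average a 2x2 block per channel; +2 mirrors rshrn's rounding.
        dst_uv[0] =
            (uint8_t)((src_uv[0] + src_uv[2] + row1[0] + row1[2] + 2) >> 2);  // U
        dst_uv[1] =
            (uint8_t)((src_uv[1] + src_uv[3] + row1[1] + row1[3] + 2) >> 2);  // V
        src_uv += 4;
        row1 += 4;
        dst_uv += 2;
      }
    }
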
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3");
}
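
One UV pair at a time, the gather above reduces to the following sketch (assumed equivalent to the C fallback); the four staggered pointers just let the NEON version fetch four pairs per iteration.

    #include <cstdint>

    static void ScaleUVRowDownEven_Sketch(const uint8_t* src_uv, int src_stepx,
                                          uint8_t* dst_uv, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst_uv[0] = src_uv[0];      // U
        dst_uv[1] = src_uv[1];      // V
        src_uv += src_stepx * 2;    // 2 bytes per UV pixel
        dst_uv += 2;
      }
    }
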
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
diff --git a/chromium/third_party/libyuv/source/scale_uv.cc b/chromium/third_party/libyuv/source/scale_uv.cc
new file mode 100644
index 00000000000..b0469f09b87
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_uv.cc
@@ -0,0 +1,891 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV plane to 1/2 of
+// its original size.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_C
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+ : ScaleUVRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ } else {
+ src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
+ }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+ : ScaleUVRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+ : ScaleUVRowDown2Box_SSSE3);
+ }
+ }
+#endif
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+ : ScaleUVRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+ : ScaleUVRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI
+ : ScaleUVRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI
+ : ScaleUVRowDown2Box_MMI);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+ : ScaleUVRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+ : ScaleUVRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWN2
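+
+// The selection ladder above repeats throughout this file; its shape, with
+// placeholder names (Kernel_C / Kernel_Any_NEON / Kernel_NEON are
+// illustrative, not real symbols):
+//
+//   kernel = Kernel_C;                    // portable fallback
+//   if (TestCpuFlag(kCpuHasNEON)) {
+//     kernel = Kernel_Any_NEON;           // any width, handles the tail
+//     if (IS_ALIGNED(dst_width, 8)) {
+//       kernel = Kernel_NEON;             // full SIMD blocks only
+//     }
+//   }
+//
+// The row loop then calls through the pointer once per destination row.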
+
+// ScaleUV, 1/4
+// This is an optimized version for scaling down a UV plane to 1/4 of
+// its original size.
+#if HAS_SCALEUVDOWN4BOX
+static void ScaleUVDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 * 2 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
+ ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2);
+ ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+#endif // HAS_SCALEUVDOWN4BOX
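+
+// Worked numbers for the temp rows above (editorial): each intermediate
+// row holds dst_width * 2 UV pixels at 2 bytes each, padded to a 16-byte
+// multiple so the SIMD row kernels can run at full width.
+//   dst_width = 100 -> kRowSize = (400 + 15) & ~15 = 400
+//   dst_width = 101 -> kRowSize = (404 + 15) & ~15 = 416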
+
+// ScaleUV Even
+// This is an optimized version for scaling down a UV plane by an even
+// integer factor.
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_uv, int dst_width) =
+ filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+ : ScaleUVRowDownEven_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif  // TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+ : ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif
+
+// Scale UV down with bilinear interpolation.
+#if HAS_SCALEUVBILINEARDOWN
+static void ScaleUVBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+ int64_t xl = (dx >= 0) ? x : xlast;
+ int64_t xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Rightmost pixel used. Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4-pixel-aligned rightmost pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2.
+ src_uv += xl * 2;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of UV.
+ {
+ align_buffer_64(row, clip_src_width * 2);
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif
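+
+// Worked example (editorial) of the read-window clip above, for
+// x = 0x8000 (start at pixel 0.5), dx = 0x28000 (2.5 px step), dst_width = 4:
+//   xlast = 0x8000 + 3 * 0x28000 = 0x80000   (last sample at pixel 8)
+//   xl    = (0x8000 >> 16) & ~3  = 0         (left edge, 4-aligned)
+//   xr    = (0x80000 >> 16) + 1  = 9         (bilinear also reads pixel 9)
+//   xr    = (9 + 1 + 3) & ~3     = 12        (padded, 4-aligned)
+//   clip_src_width = (12 - 0) * 2 = 24       (bytes: 12 UV pixels filtered)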
+
+// Scale UV up with bilinear interpolation.
+#if HAS_SCALEUVBILINEARUP
+static void ScaleUVBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+ }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVFilterCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVFilterCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * src_stride;
+
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8_t* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_uv + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif // HAS_SCALEUVBILINEARUP
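+
+// Kernel of the two-row cache above (editorial restatement): only two
+// column-scaled rows are live at once, and flipping the sign of rowstride
+// makes rowptr and rowptr + rowstride swap roles, so each source row is
+// scaled exactly once even though dy < 1 revisits rows:
+//
+//   if (yi != lasty) {                      // stepped onto a new source row
+//     ScaleUVFilterCols(rowptr, src, ...);  // overwrite the stale buffer
+//     rowptr += rowstride;
+//     rowstride = -rowstride;               // ping-pong
+//     lasty = yi;
+//   }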
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
+
+static void ScaleUVSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
+ (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+}
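+
+An editorial sketch of the 16.16 walk described above; in the real code x and dx come from ScaleSlope(), not from this naive quotient:
+
+    #include <cstdint>
+
+    static void ScaleUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_uv,
+                                  int src_width, int dst_width) {
+      int x = 0;
+      int dx = (src_width << 16) / dst_width;  // e.g. 640 -> 256 is 0x28000
+      for (int j = 0; j < dst_width; ++j) {
+        const uint8_t* s = src_uv + (x >> 16) * 2;  // 2 bytes per UV pixel
+        dst_uv[2 * j + 0] = s[0];
+        dst_uv[2 * j + 1] = s[1];
+        x += dx;  // low 16 bits accumulate the fractional position
+      }
+    }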
+
+// Copy UV with optional flipping
+#if HAS_UVCOPY
+static int UVCopy(const uint8_t* src_UV,
+ int src_stride_UV,
+ uint8_t* dst_UV,
+ int dst_stride_UV,
+ int width,
+ int height) {
+ if (!src_UV || !dst_UV || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_UV = src_UV + (height - 1) * src_stride_UV;
+ src_stride_UV = -src_stride_UV;
+ }
+
+ CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
+ return 0;
+}
+#endif // HAS_UVCOPY
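+
+// Usage sketch (hypothetical caller and dimensions): a negative height
+// makes UVCopy start at the last row and negate the stride, yielding a
+// vertically flipped copy of a 160x120 UV plane (strides in bytes):
+//
+//   UVCopy(src_uv, 320, dst_uv, 320, 160, -120);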
+
+// Scale a UV plane (from NV12)
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleUV(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64_t clipf = (int64_t)(clip_x)*dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 2;
+ dst += clip_x * 2;
+ }
+ if (clip_y) {
+ int64_t clipf = (int64_t)(clip_y)*dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+      // Optimized even scale down, i.e. 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+ ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+#endif
+ }
+      // Optimized odd scale down, i.e. 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+#ifdef HAS_UVCOPY
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst,
+ dst_stride, clip_width, clip_height);
+ return;
+ }
+#endif
+ }
+ }
+ }
+ // HAS_SCALEPLANEVERTICAL
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, 4, filtering);
+ return;
+ }
+
+#if HAS_SCALEUVBILINEARUP
+ if (filtering && dy < 65536) {
+ ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+ if (filtering) {
+ ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+ ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
+}
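+
+// Editorial decode of the step tests above (dx/dy are 16.16 source steps
+// per destination pixel):
+//   ((dx | dy) & 0xffff) == 0   -> both steps are whole source pixels
+//   (dx & 0x10000) == 0         -> even factor (2x, 4x, ...) when integer
+//   dx == 0x20000               -> exactly 1/2: ScaleUVDown2
+//   dx == 0x40000, kFilterBox   -> exactly 1/4: ScaleUVDown4Box
+//   dx == dy == 0x10000         -> unscaled: straight UVCopy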
+
+// Scale a UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+ dst_width, dst_height, 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
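+
+Usage sketch for the new public entry point (hypothetical caller; buffer sizes illustrative): halve the UV plane of a 640x360 NV12 frame. Widths and heights are in UV pixels, strides in bytes:
+
+    #include "libyuv/scale.h"
+
+    void HalveNV12UV(const uint8_t* src_uv, uint8_t* dst_uv) {
+      UVScale(src_uv, 640, 320, 180,  // src: 320x180 UV pixels, stride 640
+              dst_uv, 320, 160, 90,   // dst: 160x90 UV pixels, stride 320
+              kFilterBox);
+    }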
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/tools_libyuv/OWNERS b/chromium/third_party/libyuv/tools_libyuv/OWNERS
index 2cb971d2b72..aae4fb6e021 100644
--- a/chromium/third_party/libyuv/tools_libyuv/OWNERS
+++ b/chromium/third_party/libyuv/tools_libyuv/OWNERS
@@ -1 +1,4 @@
-phoglund@chromium.org
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
+
diff --git a/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py b/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py
index 37727ab1a69..9b9660de0bb 100755
--- a/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py
+++ b/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py
@@ -8,7 +8,7 @@
# be found in the AUTHORS file in the root of the source tree.
# This is a modified copy of the script in
-# https://chromium.googlesource.com/external/webrtc/+/master/tools-webrtc/autoroller/roll_deps.py
+# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py
# customized for libyuv.
@@ -22,7 +22,7 @@ import os
import re
import subprocess
import sys
-import urllib
+import urllib2
# Skip these dependencies (list without solution name prefix).
@@ -37,7 +37,7 @@ CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
-CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$')
+CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z-]+)\'$')
ROLL_BRANCH_NAME = 'roll_chromium_revision'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -69,6 +69,7 @@ def ParseDepsDict(deps_content):
local_scope = {}
global_scope = {
'Var': VarLookup(local_scope),
+ 'Str': lambda s: s,
'deps_os': {},
}
exec(deps_content, global_scope, local_scope)
@@ -90,7 +91,7 @@ def ParseCommitPosition(commit_message):
for line in reversed(commit_message.splitlines()):
m = COMMIT_POSITION_RE.match(line.strip())
if m:
- return m.group(1)
+ return int(m.group(1))
logging.error('Failed to parse commit position id from:\n%s\n',
commit_message)
sys.exit(-1)
@@ -109,7 +110,7 @@ def _RunCommand(command, working_dir=None, ignore_exit_code=False,
logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
env = os.environ.copy()
if extra_env:
- assert all(type(value) == str for value in extra_env.values())
+ assert all(isinstance(value, str) for value in extra_env.values())
logging.debug('extra env: %s', extra_env)
env.update(extra_env)
p = subprocess.Popen(command, stdout=subprocess.PIPE,
@@ -169,7 +170,7 @@ def ReadRemoteCrCommit(revision):
def ReadUrlContent(url):
"""Connect to a remote host and read the contents. Returns a list of lines."""
- conn = urllib.urlopen(url)
+ conn = urllib2.urlopen(url)
try:
return conn.readlines()
except IOError as e:
@@ -274,7 +275,7 @@ def CalculateChangedClang(new_cr_rev):
match = CLANG_REVISION_RE.match(line)
if match:
return match.group(1)
- raise RollError('Could not parse Clang revision!')
+    raise RollError('Could not parse Clang revision from:\n' +
+                    '\n'.join(' ' + l for l in lines))
with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f:
current_lines = f.readlines()
@@ -298,9 +299,6 @@ def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval))
commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE %
rev_interval))
- # TBR field will be empty unless in some custom cases, where some engineers
- # are added.
- tbr_authors = ''
if changed_deps_list:
commit_msg.append('Changed dependencies:')
@@ -322,7 +320,11 @@ def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
else:
commit_msg.append('No update to Clang.\n')
- commit_msg.append('TBR=%s' % tbr_authors)
+ # TBR needs to be non-empty for Gerrit to process it.
+ git_author = _RunCommand(['git', 'config', 'user.email'],
+ working_dir=CHECKOUT_SRC_DIR)[0].strip()
+ commit_msg.append('TBR=%s' % git_author)
+
commit_msg.append('BUG=None')
return '\n'.join(commit_msg)
@@ -397,20 +399,36 @@ def _LocalCommit(commit_msg, dry_run):
_RunCommand(['git', 'commit', '-m', commit_msg])
-def _UploadCL(dry_run, rietveld_email=None):
- logging.info('Uploading CL...')
- if not dry_run:
- cmd = ['git', 'cl', 'upload', '-f']
- if rietveld_email:
- cmd.append('--email=%s' % rietveld_email)
- _RunCommand(cmd, extra_env={'EDITOR': 'true'})
+def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos):
+ if skip_cq:
+ return 0
+ if (new_commit_pos - current_commit_pos) < cq_over:
+ return 1
+ return 2
-def _SendToCQ(dry_run, skip_cq):
- logging.info('Sending the CL to the CQ...')
- if not dry_run and not skip_cq:
- _RunCommand(['git', 'cl', 'set_commit'])
- logging.info('Sent the CL to the CQ.')
+def _UploadCL(commit_queue_mode):
+ """Upload the committed changes as a changelist to Gerrit.
+
+ commit_queue_mode:
+ - 2: Submit to commit queue.
+ - 1: Run trybots but do not submit to CQ.
+ - 0: Skip CQ, upload only.
+ """
+ cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail']
+ if commit_queue_mode >= 2:
+ logging.info('Sending the CL to the CQ...')
+ cmd.extend(['--use-commit-queue'])
+ elif commit_queue_mode >= 1:
+ logging.info('Starting CQ dry run...')
+ cmd.extend(['--cq-dry-run'])
+ extra_env = {
+ 'EDITOR': 'true',
+ 'SKIP_GCE_AUTH_FOR_GIT': '1',
+ }
+ stdout, stderr = _RunCommand(cmd, extra_env=extra_env)
+ logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s',
+ stdout, stderr)
def main():
@@ -420,10 +438,6 @@ def main():
p.add_argument('-r', '--revision',
help=('Chromium Git revision to roll to. Defaults to the '
'Chromium HEAD revision if omitted.'))
- p.add_argument('-u', '--rietveld-email',
- help=('E-mail address to use for creating the CL at Rietveld'
- 'If omitted a previously cached one will be used or an '
- 'error will be thrown during upload.'))
p.add_argument('--dry-run', action='store_true', default=False,
help=('Calculate changes and modify DEPS, but don\'t create '
'any local branch, commit, upload CL or send any '
@@ -432,8 +446,12 @@ def main():
default=False,
help=('Ignore if the current branch is not master or if there '
'are uncommitted changes (default: %(default)s).'))
- p.add_argument('--skip-cq', action='store_true', default=False,
- help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp = p.add_mutually_exclusive_group()
+ grp.add_argument('--skip-cq', action='store_true', default=False,
+ help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp.add_argument('--cq-over', type=int, default=1,
+ help=('Commit queue dry run if the revision difference '
+ 'is below this number (default: %(default)s)'))
p.add_argument('-v', '--verbose', action='store_true', default=False,
help='Be extra verbose in printing of log messages.')
opts = p.parse_args()
@@ -478,8 +496,11 @@ def main():
_CreateRollBranch(opts.dry_run)
UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps)
_LocalCommit(commit_msg, opts.dry_run)
- _UploadCL(opts.dry_run, opts.rietveld_email)
- _SendToCQ(opts.dry_run, opts.skip_cq)
+ commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over,
+ current_commit_pos, new_commit_pos)
+ logging.info('Uploading CL...')
+ if not opts.dry_run:
+ _UploadCL(commit_queue_mode)
return 0
diff --git a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS
index 0a919805c2c..9b67a8f6789 100644
--- a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS
+++ b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS
@@ -1,3 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
pbos@chromium.org
-phoglund@chromium.org
-
diff --git a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS
index da77b4ef23f..9b67a8f6789 100644
--- a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS
+++ b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS
@@ -1,4 +1,3 @@
-pbos@webrtc.org
-phoglund@webrtc.org
+mbonadei@chromium.org
fbarchard@chromium.org
-
+pbos@chromium.org
diff --git a/chromium/third_party/libyuv/unit_test/color_test.cc b/chromium/third_party/libyuv/unit_test/color_test.cc
index 4bb448d56fe..842fd994441 100644
--- a/chromium/third_party/libyuv/unit_test/color_test.cc
+++ b/chromium/third_party/libyuv/unit_test/color_test.cc
@@ -20,21 +20,19 @@
namespace libyuv {
-// TODO(fbarchard): Port high accuracy YUV to RGB to Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define ERROR_R 1
-#define ERROR_G 1
-#define ERROR_B 3
-#define ERROR_FULL 6
-#define ERROR_J420 5
+// TODO(fbarchard): clang x86 has a higher-accuracy YUV-to-RGB conversion.
+// Port it to Visual C and other CPUs.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define ERROR_FULL 5
+#define ERROR_J420 4
#else
+#define ERROR_FULL 6
+#define ERROR_J420 6
+#endif
#define ERROR_R 1
#define ERROR_G 1
#define ERROR_B 3
-#define ERROR_FULL 5
-#define ERROR_J420 3
-#endif
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
TEST_F(LibYUVColorTest, TESTNAME) { \
@@ -187,6 +185,52 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
+static void YUVHToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ H422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+static void YUVRec2020ToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ U422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
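The helpers read blue, green and red back from byte offsets 0, 1 and 2 because libyuv ARGB is the byte sequence B, G, R, A in memory, i.e. a little-endian 0xAARRGGBB word. A minimal illustration of that layout (not part of the tests):

```cpp
#include <cstdint>

// libyuv "ARGB" laid out in memory; this mirrors the readback order used by
// the helpers above (illustrative only).
struct ArgbPixel {
  uint8_t b;  // orig_pixels[0]
  uint8_t g;  // orig_pixels[1]
  uint8_t r;  // orig_pixels[2]
  uint8_t a;  // orig_pixels[3]
};

static_assert(sizeof(ArgbPixel) == 4, "packed 4-byte pixel");
```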
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@@ -335,18 +379,41 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
+// BT.601 YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
+// JPEG YUV to RGB reference
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
+// BT.709 YUV to RGB reference
+// See also http://www.equasys.de/colorconversion.html
+static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
+ *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.213 - (v - 128) * 0.533);
+ *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.112);
+}
+
+// BT.2020 YUV to RGB reference
+static void YUVRec2020ToRGBReference(int y,
+ int u,
+ int v,
+ int* r,
+ int* g,
+ int* b) {
+ *r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
+ *g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
+ (v - 128) * 0.65042);
+ *b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
+}
+
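All of the limited-range reference functions share the (y - 16) * 1.164 luma term and differ only in their chroma coefficients. A quick grey-point check of the BT.601 version above (an illustrative sketch, not part of the tests): Y = U = V = 128 should yield near-equal RGB.

```cpp
#include <cstdio>

int main() {
  const double y = 128, u = 128, v = 128;
  // Same coefficients as YUVToRGBReference above, with the signs folded.
  const double r = (y - 16) * 1.164 + (v - 128) * 1.596;
  const double g = (y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813;
  const double b = (y - 16) * 1.164 + (u - 128) * 2.018;
  std::printf("R=%.1f G=%.1f B=%.1f\n", r, g, b);  // 130.4 for all three
  return 0;
}
```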
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@@ -473,7 +540,11 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
// Step by 5 on the inner loop goes from 0 to 255 inclusive.
// Set to 1 for better coverage. 3, 5 or 17 for faster testing.
+#ifdef ENABLE_SLOW_TESTS
+#define FASTSTEP 1
+#else
#define FASTSTEP 5
+#endif
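FASTSTEP trades coverage for runtime: the U and V loops below are always exhaustive, so the step only thins the Y axis. A quick tally of how many (u, v, y) triples each setting visits per test (an illustrative sketch, not part of the suite):

```cpp
#include <cstdio>

int main() {
  const int steps[] = {1, 3, 5, 17};
  for (int step : steps) {
    const long y_samples = 255 / step + 1;       // trip count of the Y loop
    const long total = 256L * 256L * y_samples;  // all (u, v, y) triples
    std::printf("FASTSTEP=%2d -> %ld triples\n", step, total);
  }
  return 0;
}
```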
TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = {
0,
@@ -531,6 +602,66 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
}
PrintHistogram(rh, gh, bh);
}
+
+TEST_F(LibYUVColorTest, TestFullYUVH) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVHToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVHToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ // TODO(crbug.com/libyuv/862): Reduce the errors in the B channel.
+ EXPECT_NEAR(b0, b1, 15);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+TEST_F(LibYUVColorTest, TestFullYUVRec2020) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVRec2020ToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVRec2020ToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ // TODO(crbug.com/libyuv/863): Reduce the errors in the B channel.
+ EXPECT_NEAR(b0, b1, 18);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
#undef FASTSTEP
TEST_F(LibYUVColorTest, TestGreyYUVJ) {
diff --git a/chromium/third_party/libyuv/unit_test/compare_test.cc b/chromium/third_party/libyuv/unit_test/compare_test.cc
index 136254e169b..bd99cdd3ac3 100644
--- a/chromium/third_party/libyuv/unit_test/compare_test.cc
+++ b/chromium/third_party/libyuv/unit_test/compare_test.cc
@@ -15,10 +15,13 @@
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
-#include "libyuv/compare_row.h" /* For HammingDistance_C */
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/compare_row.h" /* For HammingDistance_C */
+#endif
+
namespace libyuv {
// hash seed of 5381 recommended.
@@ -206,6 +209,7 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
free_aligned_buffer_page_end(src_a);
}
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
@@ -403,6 +407,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
}
+#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, TestHammingDistance) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc
index e11b101fca2..59a9480d679 100644
--- a/chromium/third_party/libyuv/unit_test/convert_test.cc
+++ b/chromium/third_party/libyuv/unit_test/convert_test.cc
@@ -12,8 +12,6 @@
#include <stdlib.h>
#include <time.h>
-#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
-
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -29,12 +27,14 @@
#include "libyuv/rotate.h"
#include "libyuv/video_common.h"
-#if defined(__arm__) || defined(__aarch64__)
-// arm version subsamples by summing 4 pixels then multiplying by matrix with
-// 4x smaller coefficients which are rounded to nearest integer.
-#define ARM_YUV_ERROR 4
-#else
-#define ARM_YUV_ERROR 0
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+#endif
+
+// Some functions fail on big-endian CPUs. Enable these tests on all CPUs
+// except PowerPC; they are not optimized, so they are disabled by default.
+#if !defined(__powerpc__) && defined(ENABLE_SLOW_TESTS)
+#define LITTLE_ENDIAN_ONLY_TEST 1
#endif
namespace libyuv {
@@ -216,41 +216,23 @@ TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_EQ(0, max_diff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 3); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 3); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
@@ -281,6 +263,23 @@ TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+// Wrapper to keep the API the same as the three-plane conversions.
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* /* src_u */,
+ int /* src_stride_u */,
+ const uint8_t* /* src_v */,
+ int /* src_stride_v */,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, width, height);
+}
+
#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
@@ -294,10 +293,10 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
@@ -312,46 +311,33 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>( \
- dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j], \
+ dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
@@ -374,6 +360,92 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
+TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2)
+TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2)
+TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
+TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
+
+#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
+ OFF, DOY) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, \
+ 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, \
+ 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
+ dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
+ kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \
+ NEG kHeight); \
+ } \
+ if (DOY) { \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
+ 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
+
+TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
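The _NullY variant above passes NULL for the Y destination, so the harness also covers callers that only want the chroma plane rewritten. Under the assumption that NV21ToNV12 is declared in libyuv/planar_functions.h in this tree (the header placement is a guess), that usage looks like:

```cpp
#include <cstdint>

#include "libyuv/planar_functions.h"  // assumed home of NV21ToNV12

// Rewrite NV21 chroma (VU) as NV12 (UV) and skip the luma copy by passing
// NULL for dst_y, as the _NullY test variant does. Strides assume tightly
// packed planes; illustrative only.
int Nv21ChromaToNv12(const uint8_t* src_y, const uint8_t* src_vu,
                     uint8_t* dst_uv, int width, int height) {
  const int uv_stride = ((width + 1) / 2) * 2;
  return libyuv::NV21ToNV12(src_y, width, src_vu, uv_stride,
                            /*dst_y=*/NULL, width, dst_uv, uv_stride, width,
                            height);
}
```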
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
@@ -428,43 +500,25 @@ TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
if (DOY) { \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
} \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
@@ -554,43 +608,60 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
YALIGN, benchmark_width_, _Opt, +, 0)
TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
+#endif
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
+#endif
#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
+ YALIGN, W1280, N, NEG, OFF, ATTEN) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -625,15 +696,9 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
ATTEN); \
} \
- int max_diff = 0; \
for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \
- static_cast<int>(dst_argb_opt[i + OFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
@@ -643,23 +708,48 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
}
#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
+ YALIGN) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
+ YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \
+ YALIGN, benchmark_width_, _Unaligned, +, 1, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
-
-TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
-TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
-
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- W1280, DIFF, N, NEG, OFF) \
+ YALIGN, benchmark_width_, _Premult, +, 0, 1)
+
+#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+
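These macros adapt the test harness to the *Matrix entry points; outside the tests the same conversions are reached by passing the color-space constants directly. A sketch, under the assumption that I420AlphaToARGBMatrix and kYuvH709Constants are declared in libyuv/convert_argb.h as in this tree, with illustrative buffer sizes:

```cpp
#include <cstdint>
#include <vector>

#include "libyuv/convert_argb.h"

void ConvertH420AlphaFrame(int width, int height) {
  const int half_w = (width + 1) / 2;
  const int half_h = (height + 1) / 2;
  std::vector<uint8_t> y(width * height), a(width * height);
  std::vector<uint8_t> u(half_w * half_h), v(half_w * half_h);
  std::vector<uint8_t> argb(width * height * 4);
  // BT.709 limited range; a final argument of 1 would premultiply
  // (attenuate) by alpha, as the _Premult variant above does.
  libyuv::I420AlphaToARGBMatrix(y.data(), width, u.data(), half_w, v.data(),
                                half_w, a.data(), width, argb.data(),
                                width * 4, &libyuv::kYuvH709Constants, width,
                                height, /*attenuate=*/0);
}
```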
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
+ BPP_B, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -694,22 +784,16 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
- FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
+ FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
kHeight); \
- FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+ FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
kHeight); \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j], \
+ dst_argb32_opt[i * kWidth * 4 + j]); \
} \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
free_aligned_buffer_page_end(dst_argb_c); \
@@ -718,89 +802,62 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
free_aligned_buffer_page_end(dst_argb32_opt); \
}
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB24, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RGB24, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
-
-#ifdef DO_THREE_PLANES
-// Do 3 allocations for yuv. conventional but slower.
-#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \
- kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \
- kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \
- static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \
- static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-#else
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Opt, +, 0)
+
+#define JNV12ToARGB(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToARGB(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToABGR(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToABGR(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB24(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToRGB24(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToRAW(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToRAW(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB565(a, b, c, d, e, f, g, h) \
+ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+
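JNV12ToABGR and JNV21ToABGR exploit a symmetry rather than a dedicated ABGR path: calling the opposite NV entry point with the VU-swapped (kYvu*) constants swaps R and B in the output, which turns ARGB into ABGR. A sketch, assuming the declarations in libyuv/convert_argb.h:

```cpp
#include <cstdint>

#include "libyuv/convert_argb.h"

// Full-range (JPEG) NV12 -> ABGR. NV21ToARGBMatrix reads the chroma plane as
// VU; feeding it NV12 (UV) together with the Yvu constants mirrors R and B,
// which is exactly ABGR. Illustrative only.
int JpegNV12ToABGR(const uint8_t* y, int y_stride, const uint8_t* uv,
                   int uv_stride, uint8_t* abgr, int abgr_stride, int width,
                   int height) {
  return libyuv::NV21ToARGBMatrix(y, y_stride, uv, uv_stride, abgr,
                                  abgr_stride, &libyuv::kYvuJPEGConstants,
                                  width, height);
}
```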
+TESTBIPLANARTOB(JNV12, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(JNV21, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(JNV12, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(JNV21, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(JNV12, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(JNV21, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(JNV12, 2, 2, RAW, RAW, 3)
+TESTBIPLANARTOB(JNV21, 2, 2, RAW, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBIPLANARTOB(JNV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3)
+TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3)
+TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
+ W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -832,14 +889,12 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
} \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_uv_c[i * kStrideUV + j]), \
- static_cast<int>(dst_uv_opt[i * kStrideUV + j]), DIFF); \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
} \
} \
free_aligned_buffer_page_end(dst_y_c); \
@@ -848,39 +903,39 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_argb); \
}
-#endif
-#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- DIFF) \
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ benchmark_width_ - 4, _Any, +, 0) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
+ benchmark_width_, _Unaligned, +, 1) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
+ benchmark_width_, _Invert, -, 0) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
-TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
-TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
-// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
-TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
-TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
-TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
-TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
-TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
-TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
-TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
+ benchmark_width_, _Opt, +, 0)
+
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1)
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2)
+TESTATOPLANAR(J400, 1, 1, J420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
SUBSAMP_Y, W1280, N, NEG, OFF) \
@@ -911,28 +966,17 @@ TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_LE(max_diff, 4); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < kStrideUV * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \
+ dst_uv_opt[i * kStrideUV * 2 + j]); \
} \
} \
- EXPECT_LE(max_diff, 4); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
@@ -952,11 +996,15 @@ TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
+TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2)
+TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2)
TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ HEIGHT_B, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -982,22 +1030,16 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \
kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
}
#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
+ STRIDE_B, HEIGHT_B) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
for (int times = 0; times < benchmark_iterations_; ++times) { \
const int kWidth = (fastrand() & 63) + 1; \
@@ -1023,7 +1065,7 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
kHeight); \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
@@ -1032,61 +1074,79 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
}
#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
+ HEIGHT_B) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ HEIGHT_B, benchmark_width_, _Invert, -, 0) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ HEIGHT_B, benchmark_width_, _Opt, +, 0) \
TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
-
-// TODO(fbarchard): make ARM version of C code that matches NEON.
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
-TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+ HEIGHT_B)
+
+TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1)
+TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1)
+#endif
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1)
+#endif
+TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1)
+TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1)
+#endif
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1)
+#endif
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1)
+TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1)
+TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1)
+TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
+#endif
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1) // 4
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1)
+TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1)
+TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1)
+TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1)
+TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1)
+TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1)
+TESTATOB(RGB24, 3, 3, 1, RGB24Mirror, 3, 3, 1)
+TESTATOB(RAW, 3, 3, 1, J400, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1)
+#endif
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1)
#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ HEIGHT_B, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -1112,22 +1172,16 @@ TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
kStrideB, NULL, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
}
#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
+ STRIDE_B, HEIGHT_B) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
for (int times = 0; times < benchmark_iterations_; ++times) { \
const int kWidth = (fastrand() & 63) + 1; \
@@ -1152,15 +1206,9 @@ TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
MaskCpuFlags(benchmark_cpu_info_); \
FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
NULL, kWidth, kHeight); \
- int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
@@ -1168,19 +1216,21 @@ TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
}
#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
+ HEIGHT_B) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ HEIGHT_B, benchmark_width_, _Invert, -, 0) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ HEIGHT_B, benchmark_width_, _Opt, +, 0) \
TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
+ HEIGHT_B)
-TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
+#endif
#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \
@@ -1267,6 +1317,7 @@ TEST_F(LibYUVConvertTest, ValidateJpeg) {
// EOI, SOI. Expect pass.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - kOff + 0] = 0xff;
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
@@ -1293,6 +1344,7 @@ TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
// EOI, SOI. Expect pass.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - kOff + 0] = 0xff;
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
@@ -1326,6 +1378,7 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) {
// SOI but no EOI. Expect fail.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
for (int times = 0; times < benchmark_iterations_; ++times) {
EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
}
@@ -1343,85 +1396,823 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) {
TEST_F(LibYUVConvertTest, FuzzJpeg) {
// SOI but no EOI. Expect fail.
for (int times = 0; times < benchmark_iterations_; ++times) {
- const int kSize = fastrand() % 5000 + 2;
+ const int kSize = fastrand() % 5000 + 3;
align_buffer_page_end(orig_pixels, kSize);
MemRandomize(orig_pixels, kSize);
// Add SOI so frame will be scanned.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - 1] = 0xff;
- ValidateJpeg(orig_pixels, kSize); // Failure normally expected.
+ ValidateJpeg(orig_pixels,
+ kSize); // Failure normally expected.
free_aligned_buffer_page_end(orig_pixels);
}
}
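The extra orig_pixels[2] = 0xff seeded in the tests above, together with the fuzzer's minimum buffer size growing from 2 to 3, suggests the validator now reads at least one byte past the SOI marker. A sketch of the skeleton the tests construct, assuming ValidateJpeg is declared in libyuv/mjpeg_decoder.h; whether a given buffer passes also depends on the validator's other heuristics:

```cpp
#include <cstdint>
#include <cstring>

#include "libyuv/mjpeg_decoder.h"  // assumed home of ValidateJpeg

// Build the smallest plausible JPEG skeleton the tests use: FF D8 FF up
// front and FF D9 (EOI) at the end. Requires size >= 5; illustrative only.
bool LooksLikeJpeg(uint8_t* buf, size_t size) {
  std::memset(buf, 0, size);
  buf[0] = 0xff;
  buf[1] = 0xd8;  // SOI.
  buf[2] = 0xff;  // Byte the tightened scan expects after SOI.
  buf[size - 2] = 0xff;
  buf[size - 1] = 0xd9;  // EOI.
  return libyuv::ValidateJpeg(buf, size) != 0;
}
```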
-TEST_F(LibYUVConvertTest, MJPGToI420) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
- align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
+// Test data created in GIMP. In the JPEG export dialog, disable
+// thumbnails etc., choose a subsampling, and use low quality
+// (50) to keep the size small. Generated with xxd -i test.jpg.
+// test 0 is J400
+static const uint8_t kTest0Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10,
+ 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01,
+ 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest0JpgLen = 421;
+
+// test 1 is J444
+static const uint8_t kTest1Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda,
+ 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01,
+ 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb,
+ 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11,
+ 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00,
+ 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99,
+ 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31,
+ 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01,
+ 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72,
+ 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11,
+ 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00,
+ 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2,
+ 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c,
+ 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61,
+ 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21,
+ 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01,
+ 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48,
+ 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01,
+ 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff,
+ 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26,
+ 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02,
+ 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5,
+ 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00,
+ 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61,
+ 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01,
+ 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a,
+ 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96,
+ 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad,
+ 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7,
+ 0xd4, 0xff, 0xd9};
+static const size_t kTest1JpgLen = 735;
+
+// test 2 is J420
+static const uint8_t kTest2Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00,
+ 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10,
+ 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02,
+ 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62,
+ 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f,
+ 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c,
+ 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f,
+ 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11,
+ 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e,
+ 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01,
+ 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest2JpgLen = 685;
+
+// test 3 is J422
+static const uint8_t kTest3Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03,
+ 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84,
+ 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32,
+ 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00,
+ 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31,
+ 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f,
+ 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9,
+ 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6,
+ 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03,
+ 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff,
+ 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53,
+ 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca,
+ 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04,
+ 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9,
+ 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5,
+ 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c,
+ 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00,
+ 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest3JpgLen = 704;
+
+// test 4 is J422 vertical - not supported
+static const uint8_t kTest4Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01,
+ 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff,
+ 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02,
+ 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9,
+ 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01,
+ 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0,
+ 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e,
+ 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde,
+ 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a,
+ 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca,
+ 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff,
+ 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31,
+ 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a,
+ 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd,
+ 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30,
+ 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
+ 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest4JpgLen = 701;
+
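+// Note: kTest0Jpg..kTest4Jpg above are tiny 32x16 progressive JPEGs covering
+// J400, J444, J420, J422 and an unsupported vertical J422, used by the MJPG
+// tests below.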
+TEST_F(LibYUVConvertTest, TestMJPGSize) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ printf("test jpeg size %d x %d\n", width, height);
+}
- // EOI, SOI to make MJPG appear valid.
- memset(orig_pixels, 0, kSize);
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+TEST_F(LibYUVConvertTest, TestMJPGToI420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
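+ // Scale the iteration count so the total pixels processed stays comparable
+ // to the configured benchmark size; the test image is much smaller.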
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_u, half_width * half_height);
+ align_buffer_page_end(dst_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
+ dst_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
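+ // (HashDjb2 is the djb2 hash; the 5381 argument is its standard seed, and
+ // a matching hash pins every decoded byte.)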
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
+ uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_u_hash, 2501859930u);
+ EXPECT_EQ(dst_v_hash, 2126459123u);
- for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret =
- MJPGToI420(orig_pixels, kSize, dst_y_opt, benchmark_width_, dst_u_opt,
- SUBSAMPLE(benchmark_width_, 2), dst_v_opt,
- SUBSAMPLE(benchmark_width_, 2), benchmark_width_,
- benchmark_height_, benchmark_width_, benchmark_height_);
- // Expect failure because image is not really valid.
- EXPECT_EQ(1, ret);
- }
-
- free_aligned_buffer_page_end(dst_y_opt);
- free_aligned_buffer_page_end(dst_u_opt);
- free_aligned_buffer_page_end(dst_v_opt);
- free_aligned_buffer_page_end(orig_pixels);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
}
-TEST_F(LibYUVConvertTest, MJPGToARGB) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
- align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4);
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
- // EOI, SOI to make MJPG appear valid.
- memset(orig_pixels, 0, kSize);
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
- for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret = MJPGToARGB(orig_pixels, kSize, dst_argb_opt, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_);
- // Expect failure because image is not really valid.
- EXPECT_EQ(1, ret);
+ // Convert to NV21
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
+ half_width * 2, width, height, width, height);
}
+ // Expect success.
+ EXPECT_EQ(0, ret);
- free_aligned_buffer_page_end(dst_argb_opt);
- free_aligned_buffer_page_end(orig_pixels);
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV21
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_vu, half_width * half_height * 2);
+
+ I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_vu, half_width * 2, width, height);
+
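+ // Decoding straight to NV21 must match decoding to I420 and then
+ // converting with I420ToNV21.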
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_vu[i], dst3_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_vu);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ // Convert to NV12
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV12
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_uv, half_width * half_height * 2);
+
+ I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_uv, half_width * 2, width, height);
+
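+ // Decoding straight to NV12 must match decoding to I420 and then
+ // converting with I420ToNV12.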
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_uv[i], dst3_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_uv);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 3543430771u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 3543430771u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_uv_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_vu_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_argb, width * height * 4);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width,
+ height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
+ EXPECT_EQ(dst_argb_hash, 2355976473u);
+
+ free_aligned_buffer_page_end(dst_argb);
}
+static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+
+ int width = mjpeg_decoder.GetWidth();
+ int height = mjpeg_decoder.GetHeight();
+
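+ // Classify the chroma layout from the per-component sampling factors: Y is
+ // sampled 2x2 for J420, 2x1 for J422 and 1x1 for J444; J400 has a single
+ // grey component.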
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J420, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J422, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J444, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ printf("JPeg is J400, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ } else {
+ // Unknown colorspace.
+ printf("JPeg is Unknown colorspace.\n");
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret;
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGInfo) {
+ EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
+ kTest4JpgLen)); // Valid but unsupported.
+}
#endif // HAVE_JPEG
TEST_F(LibYUVConvertTest, NV12Crop) {
@@ -1504,6 +2295,78 @@ TEST_F(LibYUVConvertTest, NV12Crop) {
free_aligned_buffer_page_end(src_y);
}
+TEST_F(LibYUVConvertTest, I420CropOddY) {
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y = 1;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size = kWidth * kHeight +
+ kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) +
+ kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y);
+ align_buffer_page_end(src_y, sample_size);
+ uint8_t* src_u = src_y + kWidth * kHeight;
+ uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y);
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) {
+ src_u[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) {
+ src_v[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0,
+ libyuv::FOURCC_I420);
+ }
+
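+ // crop_y = 1 shifts luma by one full row, but chroma by crop_y / 2 = 0
+ // rows; the per-plane checks below use exactly those offsets.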
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
+ dst_y[i * kDestWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
+ dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
+ dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(src_y);
+}
+
TEST_F(LibYUVConvertTest, TestYToARGB) {
uint8_t y[32];
uint8_t expectedg[32];
@@ -1588,7 +2451,7 @@ TEST_F(LibYUVConvertTest, TestDither) {
}
#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+ YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -1619,7 +2482,6 @@ TEST_F(LibYUVConvertTest, TestDither) {
src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
/* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
@@ -1630,13 +2492,8 @@ TEST_F(LibYUVConvertTest, TestDither) {
FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
kWidth * BPP_C, kWidth, kHeight); \
for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
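+ /* Require a bit-exact match between the C and optimized paths. */ \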
+ EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
@@ -1646,20 +2503,20 @@ TEST_F(LibYUVConvertTest, TestDither) {
free_aligned_buffer_page_end(dst_argb32_opt); \
}
-#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
-TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
+#endif
#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
TEST_F(LibYUVConvertTest, NAME) { \
@@ -1783,12 +2640,14 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
@@ -1800,20 +2659,30 @@ TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+#endif
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
@@ -1887,6 +2756,12 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
OFF, FMT_C, BPP_C) \
@@ -1937,6 +2812,7 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
_Opt, +, 0, FMT_C, BPP_C)
// Caveat: Destination needs to be 4 bytes
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
@@ -1945,6 +2821,7 @@ TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
+#endif
TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
// 2x2 frames
@@ -2051,7 +2928,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
// TODO(fbarchard): Fix clamping issue affected by U channel.
#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \
+ ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -2087,15 +2964,9 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
- static_cast<int>(dst_argb_opt[i + DOFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
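+ /* Require a bit-exact match between the C and optimized paths. */ \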
+ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
@@ -2104,24 +2975,42 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
}
#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
+ YALIGN) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
+ YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1) \
+ YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0)
-
-TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 2)
+ YALIGN, benchmark_width_, _Opt, +, 0, 0)
+
+TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1)
+#endif
static int Clamp(int y) {
if (y < 0) {
@@ -2266,7 +3155,8 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) {
}
// Test 10 bit YUV to 10 bit RGB
-// Caveat: Result is near due to float rounding in expected result.
+// Caveat: Result is near due to float rounding in expected
+// result.
TEST_F(LibYUVConvertTest, TestH010ToAR30) {
const int kSize = 1024;
int histogram_b[1024];
@@ -2329,7 +3219,8 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) {
}
// Test 10 bit YUV to 10 bit RGB
-// Caveat: Result is near due to float rounding in expected result.
+// Caveat: Result is near due to float rounding in expected
+// result.
TEST_F(LibYUVConvertTest, TestH010ToAB30) {
const int kSize = 1024;
int histogram_b[1024];
@@ -2477,4 +3368,66 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
free_aligned_buffer_page_end(dest_rgb24);
}
+// Test that I400 with the JPEG matrix matches J400.
+TEST_F(LibYUVConvertTest, TestI400) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_i400, kSize);
+ align_buffer_page_end(argb_pixels_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_j400, kSize * 4);
+ align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_h709_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_2020_i400, kSize * 4);
+
+ // Fill with a grey scale ramp.
+ for (int i = 0; i < kSize; ++i) {
+ orig_i400[i] = i;
+ }
+
+ J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1);
+ I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
+ kSize, 1);
+
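+ // I400, H709 and 2020 use limited-range matrices (black at Y=16, gain of
+ // 255/219), while J400 and the JPEG matrix are full range; hence Y=16 maps
+ // to 0 on the limited-range paths but stays 16 for J400/JPEG.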
+ EXPECT_EQ(0, argb_pixels_i400[0]);
+ EXPECT_EQ(0, argb_pixels_j400[0]);
+ EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[0]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[0]);
+ EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
+ EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
+ EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);
+
+ for (int i = 0; i < kSize * 4; ++i) {
+ if ((i & 3) == 3) {
+ EXPECT_EQ(255, argb_pixels_j400[i]);
+ } else {
+ EXPECT_EQ(i / 4, argb_pixels_j400[i]);
+ }
+ EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_i400);
+ free_aligned_buffer_page_end(argb_pixels_i400);
+ free_aligned_buffer_page_end(argb_pixels_j400);
+ free_aligned_buffer_page_end(argb_pixels_jpeg_i400);
+ free_aligned_buffer_page_end(argb_pixels_h709_i400);
+ free_aligned_buffer_page_end(argb_pixels_2020_i400);
+}
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc
index c4648bb949f..7264de08016 100644
--- a/chromium/third_party/libyuv/unit_test/cpu_test.cc
+++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc
@@ -67,6 +67,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has MIPS %d\n", has_mips);
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MSA %d\n", has_msa);
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ printf("Has MMI %d\n", has_mmi);
#endif
}
@@ -158,7 +160,29 @@ TEST_F(LibYUVBaseTest, TestLinuxNeon) {
#endif
}
+TEST_F(LibYUVBaseTest, TestLinuxMipsMsaMmi) {
+ if (FileExists("../../unit_test/testdata/mips.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/mips.txt\"\n");
+
+ EXPECT_EQ(0, MipsCpuCaps("../../unit_test/testdata/mips.txt"));
+ EXPECT_EQ(kCpuHasMMI,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson3.txt"));
+ EXPECT_EQ(kCpuHasMMI,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson_mmi.txt"));
+ EXPECT_EQ(kCpuHasMSA, MipsCpuCaps("../../unit_test/testdata/mips_msa.txt"));
+ EXPECT_EQ(kCpuHasMMI | kCpuHasMSA,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson2k.txt"));
+ } else {
+ printf("WARNING: unable to load \"../../unit_test/testdata/mips.txt\"\n");
+ }
+}
+
+// TODO(fbarchard): Fix clangcl test of cpuflags.
+#ifdef _MSC_VER
+TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) {
+#else
TEST_F(LibYUVBaseTest, TestSetCpuFlags) {
+#endif
// Reset any masked flags that may have been set so auto init is enabled.
MaskCpuFlags(0);
diff --git a/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc b/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc
index 59061b98e0b..69aab74e7c8 100644
--- a/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc
+++ b/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc
@@ -12,7 +12,7 @@
#include "libyuv/cpu_id.h"
-#if defined(__clang__)
+#if defined(__clang__) && !defined(__wasm__)
#if __has_include(<pthread.h>)
#define LIBYUV_HAVE_PTHREAD 1
#endif
@@ -30,7 +30,7 @@ namespace libyuv {
void* ThreadMain(void* arg) {
int* flags = static_cast<int*>(arg);
- *flags = TestCpuFlag(kCpuHasSSSE3);
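+ // kCpuInitialized exists on every architecture, which presumably keeps
+ // this thread test portable (kCpuHasSSSE3 is x86-specific).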
+ *flags = TestCpuFlag(kCpuInitialized);
return nullptr;
}
#endif // LIBYUV_HAVE_PTHREAD
diff --git a/chromium/third_party/libyuv/unit_test/math_test.cc b/chromium/third_party/libyuv/unit_test/math_test.cc
index 0abbad51321..a1544c122b5 100644
--- a/chromium/third_party/libyuv/unit_test/math_test.cc
+++ b/chromium/third_party/libyuv/unit_test/math_test.cc
@@ -16,10 +16,14 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h"
+#endif
namespace libyuv {
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVBaseTest, TestFixedDiv) {
int num[1280];
int div[1280];
@@ -151,5 +155,6 @@ TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
EXPECT_NEAR(result_c[j], result_opt[j], 1);
}
}
+#endif // ENABLE_ROW_TESTS
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc
index 756089558f7..e05ff15640c 100644
--- a/chromium/third_party/libyuv/unit_test/planar_test.cc
+++ b/chromium/third_party/libyuv/unit_test/planar_test.cc
@@ -12,9 +12,6 @@
#include <stdlib.h>
#include <time.h>
-// row.h defines SIMD_ALIGNED, overriding unit_test.h
-#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
-
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -24,6 +21,13 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
+// row.h defines SIMD_ALIGNED, overriding unit_test.h
+// TODO(fbarchard): Remove row.h from unittests. Test public functions.
+#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
+#endif
namespace libyuv {
@@ -277,6 +281,7 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
}
}
+// EXPECT_NEAR below allows for legacy platforms that round slightly
+// differently.
TEST_F(LibYUVPlanarTest, TestARGBGray) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
@@ -313,17 +318,17 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
- EXPECT_EQ(30u, orig_pixels[0][0]);
- EXPECT_EQ(30u, orig_pixels[0][1]);
- EXPECT_EQ(30u, orig_pixels[0][2]);
+ EXPECT_NEAR(29u, orig_pixels[0][0], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][1], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][2], 1);
EXPECT_EQ(128u, orig_pixels[0][3]);
EXPECT_EQ(149u, orig_pixels[1][0]);
EXPECT_EQ(149u, orig_pixels[1][1]);
EXPECT_EQ(149u, orig_pixels[1][2]);
EXPECT_EQ(0u, orig_pixels[1][3]);
- EXPECT_EQ(76u, orig_pixels[2][0]);
- EXPECT_EQ(76u, orig_pixels[2][1]);
- EXPECT_EQ(76u, orig_pixels[2][2]);
+ EXPECT_NEAR(77u, orig_pixels[2][0], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][1], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][2], 1);
EXPECT_EQ(255u, orig_pixels[2][3]);
EXPECT_EQ(0u, orig_pixels[3][0]);
EXPECT_EQ(0u, orig_pixels[3][1]);
@@ -333,9 +338,9 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
EXPECT_EQ(255u, orig_pixels[4][1]);
EXPECT_EQ(255u, orig_pixels[4][2]);
EXPECT_EQ(255u, orig_pixels[4][3]);
- EXPECT_EQ(96u, orig_pixels[5][0]);
- EXPECT_EQ(96u, orig_pixels[5][1]);
- EXPECT_EQ(96u, orig_pixels[5][2]);
+ EXPECT_NEAR(97u, orig_pixels[5][0], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][1], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][2], 1);
EXPECT_EQ(224u, orig_pixels[5][3]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
@@ -385,30 +390,30 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
- EXPECT_EQ(30u, gray_pixels[0][0]);
- EXPECT_EQ(30u, gray_pixels[0][1]);
- EXPECT_EQ(30u, gray_pixels[0][2]);
- EXPECT_EQ(128u, gray_pixels[0][3]);
- EXPECT_EQ(149u, gray_pixels[1][0]);
- EXPECT_EQ(149u, gray_pixels[1][1]);
- EXPECT_EQ(149u, gray_pixels[1][2]);
- EXPECT_EQ(0u, gray_pixels[1][3]);
- EXPECT_EQ(76u, gray_pixels[2][0]);
- EXPECT_EQ(76u, gray_pixels[2][1]);
- EXPECT_EQ(76u, gray_pixels[2][2]);
- EXPECT_EQ(255u, gray_pixels[2][3]);
- EXPECT_EQ(0u, gray_pixels[3][0]);
- EXPECT_EQ(0u, gray_pixels[3][1]);
- EXPECT_EQ(0u, gray_pixels[3][2]);
- EXPECT_EQ(255u, gray_pixels[3][3]);
- EXPECT_EQ(255u, gray_pixels[4][0]);
- EXPECT_EQ(255u, gray_pixels[4][1]);
- EXPECT_EQ(255u, gray_pixels[4][2]);
- EXPECT_EQ(255u, gray_pixels[4][3]);
- EXPECT_EQ(96u, gray_pixels[5][0]);
- EXPECT_EQ(96u, gray_pixels[5][1]);
- EXPECT_EQ(96u, gray_pixels[5][2]);
- EXPECT_EQ(224u, gray_pixels[5][3]);
+ EXPECT_NEAR(30u, gray_pixels[0][0], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][1], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][2], 1);
+ EXPECT_NEAR(128u, gray_pixels[0][3], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][0], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][1], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][2], 1);
+ EXPECT_NEAR(0u, gray_pixels[1][3], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][0], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][1], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[2][3], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][0], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][1], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[3][3], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][0], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][1], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][3], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][0], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][1], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][2], 1);
+ EXPECT_NEAR(224u, gray_pixels[5][3], 1);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
@@ -418,6 +423,20 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
}
+
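+ // Gray of a pixel whose R, G and B channels are already equal should be
+ // the identity transform; verify exact pass-through for all 256 levels.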
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i;
+ orig_pixels[i][2] = i;
+ orig_pixels[i][3] = i;
+ }
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i, orig_pixels[i][0]);
+ EXPECT_EQ(i, orig_pixels[i][1]);
+ EXPECT_EQ(i, orig_pixels[i][2]);
+ EXPECT_EQ(i, orig_pixels[i][3]);
+ }
}
TEST_F(LibYUVPlanarTest, TestARGBSepia) {
@@ -763,27 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
}
}
-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
- SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
- for (int i = 0; i < 1280; ++i) {
- orig_pixels[i][0] = i;
- orig_pixels[i][1] = i / 2;
- orig_pixels[i][2] = i / 3;
- orig_pixels[i][3] = i / 4;
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ MaskCpuFlags(disable_cpu_flags_);
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
- EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
- EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
- EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
}
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
}
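+
+// The three *_Opt mirror tests above follow this file's standard C-vs-opt
+// harness. A minimal sketch of the pattern (names are illustrative, not
+// part of this patch):
+//
+//   MaskCpuFlags(disable_cpu_flags_);    // mask SIMD; force the C path
+//   Mirror(src, stride, dst_c, stride, w, h);       // reference output
+//   MaskCpuFlags(benchmark_cpu_info_);   // restore benchmarked CPU flags
+//   for (int i = 0; i < benchmark_iterations_; ++i)
+//     Mirror(src, stride, dst_opt, stride, w, h);   // optimized output
+//   for (int i = 0; i < n; ++i)
+//     EXPECT_EQ(dst_c[i], dst_opt[i]);   // mirroring must be bit-exact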
TEST_F(LibYUVPlanarTest, TestShade) {
@@ -1058,7 +1125,8 @@ static int TestBlend(int width,
int disable_cpu_flags,
int benchmark_cpu_info,
int invert,
- int off) {
+ int off,
+ int attenuate) {
if (width < 1) {
width = 1;
}
@@ -1072,10 +1140,12 @@ static int TestBlend(int width,
src_argb_a[i + off] = (fastrand() & 0xff);
src_argb_b[i + off] = (fastrand() & 0xff);
}
- ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
- height);
- ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
- height);
+ MemRandomize(src_argb_a, kStride * height + off);
+ MemRandomize(src_argb_b, kStride * height + off);
+ if (attenuate) {
+ ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
+ height);
+ }
memset(dst_argb_c, 255, kStride * height);
memset(dst_argb_opt, 255, kStride * height);
@@ -1105,28 +1175,35 @@ static int TestBlend(int width,
TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
int max_diff =
TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) {
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
@@ -2321,7 +2398,8 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) {
}
TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
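+ // e.g. 99x33 = 3267 pixels rounds up to 3280 (205 * 16), so opt paths
+ // that may process 16 pixels per step are compared on whole buffers.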
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(dst_pixels_opt, kPixels);
align_buffer_page_end(dst_pixels_c, kPixels);
@@ -2349,7 +2427,8 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
}
TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(orig_pixels, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
@@ -2482,7 +2561,8 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(tmp_pixels_u, kPixels);
align_buffer_page_end(tmp_pixels_v, kPixels);
@@ -2526,7 +2606,8 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(tmp_pixels_u, kPixels);
align_buffer_page_end(tmp_pixels_v, kPixels);
@@ -2568,8 +2649,39 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
+TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+ align_buffer_page_end(src_pixels, kPixels * 2);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(dst_pixels_opt, kPixels * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@@ -2617,7 +2729,8 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@@ -2666,7 +2779,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
@@ -2710,7 +2824,8 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
@@ -2746,7 +2861,8 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
#endif // HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
@@ -2776,6 +2892,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT16TO8ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
@@ -2821,9 +2938,11 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif // HAS_CONVERT16TO8ROW_AVX2
+#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
@@ -2855,6 +2974,7 @@ TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT8TO16ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
@@ -3173,32 +3293,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
- SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint16_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 + 4; ++i) {
+ for (int i = 0; i < 1280 + 8; ++i) {
orig_pixels[i] = i * 256;
}
- GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -3224,47 +3345,285 @@ extern "C" void GaussCol_C(const uint16_t* src0,
int width);
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
- SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]);
- SIMD_ALIGNED(uint32_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint32_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
+ SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 * 5; ++i) {
- orig_pixels[i] = i;
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<uint16_t>(i);
}
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
- 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
+ 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+ } else {
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+ }
+#else
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+}
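+
+// GaussCol applies the 5-tap binomial kernel 1-4-6-4-1 down a column; the
+// spot check removed below encoded exactly those weights
+// (row0*1 + row1*4 + row2*6 + row3*4 + row4*1).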
+
+TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
+ SIMD_ALIGNED(float orig_pixels[1280 + 4]);
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 1280 + 4; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0],
- 640);
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
+}
- EXPECT_EQ(dst_pixels_c[0],
- static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 +
- 640 * 4 * 1));
- EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
+TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
+ align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows
+ float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
+
+ memset(orig_pixels, 0, 1280 * 5 * 4);
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ } else {
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ }
+#else
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(orig_pixels_buf);
+}
+
+TEST_F(LibYUVPlanarTest, SwapUVRow) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+
+ align_buffer_page_end(src_pixels_vu, kPixels * 2);
+ align_buffer_page_end(dst_pixels_uv, kPixels * 2);
+ MemRandomize(src_pixels_vu, kPixels * 2);
+ memset(dst_pixels_uv, 1, kPixels * 2);
+
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(kPixels, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (int j = 0; j < benchmark_iterations_; j++) {
+ SwapUVRow(src_pixels_vu, dst_pixels_uv, kPixels);
+ }
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
+ EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_vu);
+ free_aligned_buffer_page_end(dst_pixels_uv);
+}
+#endif // ENABLE_ROW_TESTS
+
+TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
+ const int kSize = benchmark_width_ * benchmark_height_ * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ align_buffer_page_end(dst_pixels_opt, kSize);
+ align_buffer_page_end(dst_pixels_c, kSize);
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
+ }
+ memset(dst_pixels_opt, 1, kSize);
+ memset(dst_pixels_c, 2, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ }
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
+ << i;
+ }
+
+ free_aligned_buffer_page_end(dst_pixels_c);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
+ int dst_width = (benchmark_width_ + 1) / 2;
+ int dst_height = (benchmark_height_ + 1) / 2;
+ align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(tmp_pixels_u, dst_width * dst_height);
+ align_buffer_page_end(tmp_pixels_v, dst_width * dst_height);
+ align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_);
+ MemRandomize(tmp_pixels_u, dst_width * dst_height);
+ MemRandomize(tmp_pixels_v, dst_width * dst_height);
+ MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_c, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
+ EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_uv_opt);
+ free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
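+// NV12/NV21 store a full-resolution Y plane plus one interleaved chroma
+// plane at half resolution in each axis, so the chroma buffer holds
+// ((w + 1) / 2) * 2 * ((h + 1) / 2) bytes. The two copy tests below verify
+// that both planes round-trip byte-for-byte.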
+TEST_F(LibYUVPlanarTest, NV12Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_uv, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_uv, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_uv, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_uv, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV12Copy(src_y, benchmark_width_, src_uv, halfwidth * 2, dst_y,
+ benchmark_width_, dst_uv, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_uv[i], dst_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVPlanarTest, NV21Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_vu, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_vu, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_vu, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y,
+ benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_vu[i], dst_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_vu);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
}
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
index d2003895961..3208b66a2ad 100644
--- a/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
+++ b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
@@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
benchmark_cpu_info_);
}
+TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
+ int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
+
+ align_buffer_page_end(src_argb, argb_plane_size);
+ align_buffer_page_end(dst_argb, argb_plane_size);
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate270));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate270));
+
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(src_argb);
+}
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/rotate_test.cc b/chromium/third_party/libyuv/unit_test/rotate_test.cc
index d04b96e9c68..61941e63e0e 100644
--- a/chromium/third_party/libyuv/unit_test/rotate_test.cc
+++ b/chromium/third_party/libyuv/unit_test/rotate_test.cc
@@ -135,6 +135,123 @@ TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
benchmark_cpu_info_);
}
+static void I444TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i444_y_size = src_width * Abs(src_height);
+ int src_i444_uv_size = src_width * Abs(src_height);
+ int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
+ align_buffer_page_end(src_i444, src_i444_size);
+ for (int i = 0; i < src_i444_size; ++i) {
+ src_i444[i] = fastrand() & 0xff;
+ }
+
+ int dst_i444_y_size = dst_width * dst_height;
+ int dst_i444_uv_size = dst_width * dst_height;
+ int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
+ align_buffer_page_end(dst_i444_c, dst_i444_size);
+ align_buffer_page_end(dst_i444_opt, dst_i444_size);
+ memset(dst_i444_c, 2, dst_i444_size);
+ memset(dst_i444_opt, 3, dst_i444_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
+ dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
+ dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
+ dst_width, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i444_size; ++i) {
+ EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i444_c);
+ free_aligned_buffer_page_end(dst_i444_opt);
+ free_aligned_buffer_page_end(src_i444);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but are disabled because they use C code; they can
+// be exercised by passing an odd width on the command line or via an
+// environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
static void NV12TestRotate(int src_width,
int src_height,
int dst_width,
diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
index 6a0a58640e4..2fdf5f60341 100644
--- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
@@ -259,7 +259,7 @@ static int ARGBClipTestFilter(int src_width,
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
-TEST_FACTOR(8, 1, 8)
+// TEST_FACTOR(8, 1, 8)  Disabled: too slow for routine benchmark runs.
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
@@ -303,10 +303,12 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
-TEST_SCALETO(ARGBScale, 352, 288)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
+#ifdef ENABLE_SLOW_TESTS
TEST_SCALETO(ARGBScale, 1280, 720)
+TEST_SCALETO(ARGBScale, 1920, 1080)
+#endif // ENABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
@@ -454,4 +456,79 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
EXPECT_LE(diff, 10);
}
+TEST_F(LibYUVScaleTest, ARGBTest3x) {
+ const int kSrcStride = 48 * 4;
+ const int kDstStride = 16 * 4;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 48 * 3; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(49, dest_pixels[0]);
+ EXPECT_EQ(255 - 49, dest_pixels[1]);
+ EXPECT_EQ(50, dest_pixels[2]);
+ EXPECT_EQ(59, dest_pixels[3]);
+
+ ARGBScale(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(49, dest_pixels[0]);
+ EXPECT_EQ(255 - 49, dest_pixels[1]);
+ EXPECT_EQ(50, dest_pixels[2]);
+ EXPECT_EQ(59, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, ARGBTest4x) {
+ const int kSrcStride = 64 * 4;
+ const int kDstStride = 16 * 4;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 64 * 4; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_NEAR((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0], 4);
+ EXPECT_NEAR((255 - 65 + 255 - 66 + 255 - 129 + 255 - 130 + 2) / 4,
+ dest_pixels[1], 4);
+ EXPECT_NEAR((1 * 4 + 65 + 66 + 129 + 130 + 2) / 4, dest_pixels[2], 4);
+ EXPECT_NEAR((10 * 4 + 65 + 66 + 129 + 130 + 2) / 4, dest_pixels[3], 4);
+
+ ARGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(130, dest_pixels[0]);
+ EXPECT_EQ(255 - 130, dest_pixels[1]);
+ EXPECT_EQ(130 + 1, dest_pixels[2]);
+ EXPECT_EQ(130 + 10, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
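+
+// In ARGBTest3x/4x above, kFilterNone point-samples a single source pixel
+// per destination pixel: the 64x4 -> 16x1 case reads source pixel (2, 2),
+// whose channel-0 value is 2 * 64 + 2 = 130, matching the exact checks.
+// kFilterBilinear averages a neighborhood, hence the EXPECT_NEAR
+// tolerances on the filtered results.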
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc
index 08b6cffaa26..d627af02d63 100644
--- a/chromium/third_party/libyuv/unit_test/scale_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_test.cc
@@ -14,7 +14,10 @@
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
+#endif
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
@@ -22,14 +25,14 @@
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static int TestFilter(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
+static int I420TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -141,14 +144,14 @@ static int TestFilter(int src_width,
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
-static int TestFilter_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
+static int I420TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -256,41 +259,412 @@ static int TestFilter_16(int src_width,
return max_diff;
}
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int I444TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ if (!src_y || !src_u || !src_v) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt ||
+ !dst_v_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_c, dst_stride_y, dst_u_c,
+ dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt,
+ dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height,
+ f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT.
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // The C version may differ slightly from the optimized one, since the
+ // order of operations can introduce rounding differences. Diff the
+ // buffers and verify that the maximum difference does not exceed 3.
+ int max_diff = 0;
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv); ++j) {
+ int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
+ dst_u_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] -
+ dst_v_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_u_c);
+ free_aligned_buffer_page_end(dst_v_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+
+ return max_diff;
+}
+
+// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
+// 0 = exact.
+static int I444TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_16, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_16, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
+ uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16);
+ uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_16[i] = src_y[i];
+ }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_16[i] = src_u[i];
+ p_src_v_16[i] = src_v[i];
+ }
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2);
+
+ uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
+ uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16);
+ uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv,
+ p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16,
+ dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_16);
+ free_aligned_buffer_page_end(dst_u_16);
+ free_aligned_buffer_page_end(dst_v_16);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_16);
+ free_aligned_buffer_page_end(src_u_16);
+ free_aligned_buffer_page_end(src_v_16);
+
+ return max_diff;
+}
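+
+// The _16 filter tests feed the same 8-bit random data through the 8-bit
+// and 16-bit scalers; with inputs confined to [0, 255] the two fixed-point
+// paths are expected to agree, so any max_diff reported here reflects real
+// implementation divergence rather than input range.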
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int NV12TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv * 2;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_uv, src_uv_plane_size);
+ if (!src_y || !src_uv) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_uv, src_uv_plane_size);
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv * 2;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_uv_c || !dst_y_opt || !dst_uv_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_c, dst_stride_y, dst_uv_c, dst_stride_uv, dst_width,
+ dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_opt, dst_stride_y, dst_uv_opt, dst_stride_uv, dst_width,
+ dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT.
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // The C version may differ slightly from the optimized one, since the
+ // order of operations can introduce rounding differences. Diff the
+ // buffers and verify that the maximum difference does not exceed 3.
+ int max_diff = 0;
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv * 2); ++j) {
+ int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] -
+ dst_uv_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_uv_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_uv_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+
+ return max_diff;
+}
+
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
// The factor of 2 accounts for chroma subsampling.
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
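+// Worked example (illustrative): for a 1280-wide source and factor 3/4
+// (nom = 3, denom = 4), SX(1280, 3, 4) = ((1280 / 3 + 1) / 2) * 4 * 2 = 1704
+// and DX(1280, 3, 4) = ((1280 / 3 + 1) / 2) * 3 * 2 = 1278; 1278 / 1704
+// reduces exactly to 3 / 4, and both dimensions stay even for the
+// subsampled chroma planes.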
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
- int diff = TestFilter( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter##_16) { \
- int diff = TestFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
+#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \
+ int diff = I420TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \
+ int diff = I444TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_16) { \
+ int diff = I420TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_16) { \
+ int diff = I444TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) { \
+ int diff = NV12TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact;
// filtered results may differ because SSSE3, NEON and C use different
// fixed-point implementations.
-#define TEST_FACTOR(name, nom, denom, boxdiff) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+#ifdef ENABLE_SLOW_TESTS
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(, name, None, nom, denom, 0) \
+ TEST_FACTOR1(, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Box, nom, denom, boxdiff)
+#else
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
+ TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
+#endif
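+// Prepending gtest's DISABLED_ prefix through the macro's first argument
+// lets one TEST_FACTOR1 body emit either enabled tests (empty argument) or
+// disabled ones; gtest skips DISABLED_* tests unless run with
+// --gtest_also_run_disabled_tests.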
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
-TEST_FACTOR(8, 1, 8, 0)
+// TEST_FACTOR(8, 1, 8, 0)  Disabled: takes ~90 seconds, too slow for
+// routine benchmark runs.
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
@@ -299,50 +673,105 @@ TEST_FACTOR(3, 1, 3, 0)
#undef SX
#undef DX
-#define TEST_SCALETO1(name, width, height, filter, max_diff) \
- TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
- int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
+#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \
+ int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter) { \
+ int diff = I444TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \
+ int diff = I420TestFilter_16( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \
+ int diff = I444TestFilter_16( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
- int diff = TestFilter(width, height, Abs(benchmark_width_), \
- Abs(benchmark_height_), kFilter##filter, \
- benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \
- height, kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \
+ int diff = I420TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(width, height, Abs(benchmark_width_), \
- Abs(benchmark_height_), kFilter##filter, \
- benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, I444##name##From##width##x##height##_##filter) { \
+ int diff = I444TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \
+ int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \
+ int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
+#ifdef ENABLE_SLOW_TESTS
// Test scale to a specified size with all 4 filters.
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 3) \
- TEST_SCALETO1(name, width, height, Bilinear, 3) \
- TEST_SCALETO1(name, width, height, Box, 3)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(, name, width, height, None, 0) \
+ TEST_SCALETO1(, name, width, height, Linear, 3) \
+ TEST_SCALETO1(, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(, name, width, height, Box, 3)
+#else
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
+#endif
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
-TEST_SCALETO(Scale, 352, 288)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
TEST_SCALETO(Scale, 1280, 720)
+#ifdef ENABLE_SLOW_TESTS
+TEST_SCALETO(Scale, 1920, 1080)
+#endif // ENABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
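Worth spelling out the fan-out here: each TEST_SCALETO line expands through four filters, and each TEST_SCALETO1 now defines ten tests, six 8-bit cases (I420/I444/NV12, in both the To and From directions) that always run, plus four 16-bit I420/I444 cases that carry the DISABLED_ prefix unless ENABLE_SLOW_TESTS is set. One size line therefore yields forty gtest cases, with names such as

  LibYUVScaleTest.I420ScaleTo320x240_Bilinear
  LibYUVScaleTest.NV12ScaleFrom640x360_None
  LibYUVScaleTest.DISABLED_I444ScaleTo1280x720_Box_16

(the last runnable via --gtest_also_run_disabled_tests).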
+#ifdef ENABLE_ROW_TESTS
#ifdef HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
@@ -437,6 +866,10 @@ extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+extern "C" void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
@@ -463,6 +896,13 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
} else {
ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
}
+#elif !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ if (has_mmi) {
+ ScaleRowUp2_16_MMI(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ } else {
+ ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ }
#else
ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
#endif
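The new branch follows libyuv's usual two-level dispatch: the MIPS/Loongson architecture is selected at compile time, and MMI availability is confirmed at run time. Condensed into a wrapper for illustration (the wrapper name is invented; the test open-codes the same logic inline):

  void ScaleRowUp2_16_Any(const uint16_t* src, ptrdiff_t stride,
                          uint16_t* dst, int width) {
  #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
    if (TestCpuFlag(kCpuHasMMI)) {  // run-time check for MMI support
      ScaleRowUp2_16_MMI(src, stride, dst, width);
      return;
    }
  #endif
    ScaleRowUp2_16_C(src, stride, dst, width);  // portable fallback
  }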
@@ -513,6 +953,7 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
EXPECT_EQ(dst_pixels_c[1279], 3839);
}
+#endif // ENABLE_ROW_TESTS
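Everything from the matching #ifdef ENABLE_ROW_TESTS above down to this #endif (the SSSE3/NEON ScaleRowDown2Box odd-width checks and the 16-bit row tests) is now compiled out by default; defining ENABLE_ROW_TESTS, e.g. via -DENABLE_ROW_TESTS on the compile line (how it is plumbed through the build is not shown in this patch), restores them.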
// Test scaling plane with 8 bit C vs 16 bit C and return maximum pixel
// difference.
@@ -583,14 +1024,14 @@ static int TestPlaneFilter_16(int src_width,
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \
- int diff = TestPlaneFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \
+ int diff = TestPlaneFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
@@ -603,7 +1044,7 @@ static int TestPlaneFilter_16(int src_width,
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
-TEST_FACTOR(8, 1, 8, 0)
+// TEST_FACTOR(8, 1, 8, 0) disabled for benchmark performance; it takes 90 seconds.
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
@@ -611,4 +1052,171 @@ TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR
#undef SX
#undef DX
+
+TEST_F(LibYUVScaleTest, PlaneTest3x) {
+ const int kSrcStride = 48;
+ const int kDstStride = 16;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 48 * 3; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(49, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(49, dest_pixels[0]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
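For reference, why 49: the plane is initialized with orig_pixels[i] = i on a 48-byte stride, so the sample at row r, column c holds 48 * r + c. With a 3x reduction, output pixel 0 corresponds to the center of the first 3x3 block, row 1, column 1, i.e. 48 * 1 + 1 = 49, and both the bilinear and the point-sampled (kFilterNone) paths land on that sample here.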
+
+TEST_F(LibYUVScaleTest, PlaneTest4x) {
+ const int kSrcStride = 64;
+ const int kDstStride = 16;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 64 * 4; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(130, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
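Same bookkeeping for 4x: samples are 64 * r + c, and the filtered path averages the four center samples of the first 4x4 block with a +2 rounding term, (65 + 66 + 129 + 130 + 2) / 4 = 98, while kFilterNone point-samples row 2, column 2, i.e. 64 * 2 + 2 = 130, matching the inline comment.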
+
+// The intent is to test 200x50 to 50x200, but width and height come from
+// the benchmark parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterNone);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterNone);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
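All three rotate tests use the same cross-check skeleton, shown condensed below (src/ref/opt/w/h are shorthand for the test's orig_pixels, dest_c_pixels, dest_opt_pixels and benchmark dimensions):

  MaskCpuFlags(disable_cpu_flags_);   // force the portable C paths
  ScalePlane(src, w, w, h, ref, h, h, w, kFilterNone);  // C reference
  MaskCpuFlags(benchmark_cpu_info_);  // restore SIMD paths
  ScalePlane(src, w, w, h, opt, h, h, w, kFilterNone);  // optimized result
  for (int i = 0; i < w * h; ++i) {
    EXPECT_EQ(ref[i], opt[i]);        // require a bit-exact match
  }

Note the transposed stride and dimensions on the destination (h, h, w): the output plane has the rotated shape, which is what exercises the 200x50-to-50x200 intent.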
+
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBilinear);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBilinear);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+// The intent is to test 200x50 to 50x200, but width and height come from
+// the benchmark parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBox);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBox);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/unit_test.cc b/chromium/third_party/libyuv/unit_test/unit_test.cc
index 7d662706aaa..2aa9cdaad6e 100644
--- a/chromium/third_party/libyuv/unit_test/unit_test.cc
+++ b/chromium/third_party/libyuv/unit_test/unit_test.cc
@@ -17,6 +17,9 @@
#ifdef LIBYUV_USE_GFLAGS
#include "gflags/gflags.h"
#endif
+#ifdef LIBYUV_USE_BASE_FLAGS
+#include "base/commandlineflags.h"
+#endif
#include "libyuv/cpu_id.h"
unsigned int fastrand_seed = 0xfb;
@@ -66,6 +69,9 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_MSA")) {
cpu_info &= ~libyuv::kCpuHasMSA;
}
+ if (TestEnv("LIBYUV_DISABLE_MMI")) {
+ cpu_info &= ~libyuv::kCpuHasMMI;
+ }
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
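The new TestEnv clause means the MMI kernels can be masked out at run time like the other MIPS SIMD family, which helps attribute a failure to a specific code path; for example (binary name assumed to be libyuv's standard libyuv_unittest target):

  LIBYUV_DISABLE_MMI=1 ./libyuv_unittest --gtest_filter='LibYUVScaleTest.*'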
diff --git a/chromium/third_party/libyuv/unit_test/video_common_test.cc b/chromium/third_party/libyuv/unit_test/video_common_test.cc
index a84206a2adb..eb183aaa796 100644
--- a/chromium/third_party/libyuv/unit_test/video_common_test.cc
+++ b/chromium/third_party/libyuv/unit_test/video_common_test.cc
@@ -65,7 +65,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
- EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c
index 59c65d60e0f..46f9c1bfff4 100644
--- a/chromium/third_party/libyuv/util/cpuid.c
+++ b/chromium/third_party/libyuv/util/cpuid.c
@@ -12,10 +12,11 @@
#include <stdlib.h>
#include <string.h>
-#define INCLUDE_LIBYUV_COMPARE_H_
-#include "libyuv.h"
-#include "./psnr.h"
-#include "./ssim.h"
+#include "libyuv/cpu_id.h"
+
+#ifdef __cplusplus
+using namespace libyuv;
+#endif
int main(int argc, const char* argv[]) {
int cpu_flags = TestCpuFlag(-1);
@@ -71,6 +72,8 @@ int main(int argc, const char* argv[]) {
if (has_mips) {
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MSA %x\n", has_msa);
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ printf("Has MMI %x\n", has_mmi);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -81,7 +84,7 @@ int main(int argc, const char* argv[]) {
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- int has_f16c = TestCpuFlag(kCpuHasF16C);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
int has_gfni = TestCpuFlag(kCpuHasGFNI);
int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
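Two things worth noting in this hunk: the tool now includes only the public libyuv/cpu_id.h (the removed psnr/ssim includes look copy-pasted from the psnr util, and the using-directive keeps the unqualified TestCpuFlag/kCpu* names compiling under C++), and the %x format prints the raw flag bit rather than 0/1, so any nonzero value means the feature is present. A quick field check (tool name and invocation are assumptions):

  ./cpuid | grep 'Has MMI'   # nonzero hex value => MMI kernels are eligible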
diff --git a/chromium/third_party/libyuv/util/i444tonv12_eg.cc b/chromium/third_party/libyuv/util/i444tonv12_eg.cc
new file mode 100644
index 00000000000..0fcb4095a80
--- /dev/null
+++ b/chromium/third_party/libyuv/util/i444tonv12_eg.cc
@@ -0,0 +1,28 @@
+
+#include "libyuv/convert.h"
+
+#include <stdio.h> // for printf
+#include <string.h> // for memset
+
+int main(int, char**) {
+ unsigned char src_i444[640 * 400 * 3];
+ unsigned char dst_nv12[640 * 400 * 3 / 2];
+
+ for (size_t i = 0; i < sizeof(src_i444); ++i) {
+ src_i444[i] = i & 255;
+ }
+ memset(dst_nv12, 0, sizeof(dst_nv12));
+ libyuv::I444ToNV12(&src_i444[0], 640, // source Y
+ &src_i444[640 * 400], 640, // source U
+ &src_i444[640 * 400 * 2], 640, // source V
+ &dst_nv12[0], 640, // dest Y
+ &dst_nv12[640 * 400], 640, // dest UV
+ 640, 400); // width and height
+
+ int checksum = 0;
+ for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
+ checksum += dst_nv12[i];
+ }
+ printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
+ return 0;
+}
\ No newline at end of file
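The new example fills a 640x400 I444 ramp, converts it with libyuv::I444ToNV12(), and byte-sums the NV12 output against the golden value 0x2ec0c00, so it doubles as a standalone smoke test for the conversion. One plausible way to build it against a checked-out libyuv (include path, library location and name are assumptions, not part of the patch):

  g++ -Iinclude util/i444tonv12_eg.cc -Lout -lyuv -o i444tonv12_eg && ./i444tonv12_eg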
diff --git a/chromium/third_party/libyuv/util/psnr.cc b/chromium/third_party/libyuv/util/psnr.cc
index f54015bab82..c7bee7f97d2 100644
--- a/chromium/third_party/libyuv/util/psnr.cc
+++ b/chromium/third_party/libyuv/util/psnr.cc
@@ -189,7 +189,7 @@ static uint32_t SumSquareError_SSE2(const uint8_t* src_a,
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
- ); // NOLINT
+ ); // NOLINT
return sse;
}
#endif // LIBYUV_DISABLE_X86 etc
diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk
index c4307a431f9..b0a344ae06d 100644
--- a/chromium/third_party/libyuv/winarm.mk
+++ b/chromium/third_party/libyuv/winarm.mk
@@ -31,6 +31,7 @@ LOCAL_OBJ_FILES = \
source/scale_any.o\
source/scale_argb.o\
source/scale_common.o\
+ source/scale_uv.o\
source/video_common.o
.cc.o: