author     Jana Grill <janagrill@google.com>          2021-02-17 12:35:20 +0000
committer  Michael Brüning <michael.bruning@qt.io>    2021-04-09 10:50:31 +0000
commit     fcd5c56fe795bb48ff3b31e0fff038875c5ad689 (patch)
tree       eb133a6c1e56daabcf4418a517d856fb8a727c4a
parent     35caa7c78115cc22c405c1b6de386b73d36f8609 (diff)
download   qtwebengine-chromium-fcd5c56fe795bb48ff3b31e0fff038875c5ad689.tar.gz
[Backport] Security bug 1062941
Manual backport (library update) of patch originally reviewed on
https://chromium-review.googlesource.com/c/chromium/src/+/2692542:

Roll src/third_party/libyuv/ 6866adbec..1d3f901aa (17 commits)

https://chromium.googlesource.com/libyuv/libyuv.git/+log/6866adbec5af..1d3f901aa016

$ git log 6866adbec..1d3f901aa --date=short --no-merges --format='%ad %ae %s'
2020-12-25 fbarchard Scale bug fix with msan when scaling up in height and down in width with box filter.
2020-12-22 fbarchard Test Box filter scale plane with 1 dimension growing and the other reducing
2020-12-03 eshr NV12 Copy, include scale_uv.h
2020-11-18 thakis Stop setting mac_xcode_version in DEPS
2020-11-06 libyuv-ci-autoroll-builder Roll chromium_revision 5aaa70b53c..64c8c30faa (822628:824854)
2020-11-03 fbarchard Scale by even factor low level row function
2020-10-30 libyuv-ci-autoroll-builder Roll chromium_revision df9aecfc0b..5aaa70b53c (820568:822628)
2020-10-28 fbarchard PlaneScale, UVScale and ARGBScale test 3x and 4x down sample.
2020-10-27 fbarchard MJPGToNV12 added and build files sorted
2020-10-24 libyuv-ci-autoroll-builder Roll chromium_revision e812106b13..df9aecfc0b (817907:820568)
2020-10-16 libyuv-ci-autoroll-builder Roll chromium_revision 4892423355..e812106b13 (815587:817907)
2020-10-13 fbarchard UVScale down use AVX2 and Neon for aarch32
2020-10-13 fbarchard UVScale down by 4 use SSSE3/NEON
2020-10-12 fbarchard 2x down sample for UV planes ported to SSSE3 / NEON
2020-10-09 libyuv-ci-autoroll-builder Roll chromium_revision ccec2ad009..4892423355 (811963:815587)
2020-10-02 fbarchard I420ToARGB prototype added to convert_from.h
2020-10-01 fbarchard scale neon adjust PRFM instruction to co-issue with math

Created with:
  roll-dep src/third_party/libyuv

(cherry picked from commit 1a60856f34aa15def686168c3b392dc37a120c51)

Bug: chromium:1158178, chromium:1062941, libyuv:875, b/176195584
Change-Id: Iecf360198a90acabcbd71e57791634f5e3e861c3
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Eugene Zemtsov <eugene@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#839493}
Commit-Queue: Jana Grill <janagrill@chromium.org>
Reviewed-by: Victor-Gabriel Savu <vsavu@google.com>
Cr-Commit-Position: refs/branch-heads/4240@{#1545}
Cr-Branched-From: f297677702651916bbf65e59c0d4bbd4ce57d1ee-refs/heads/master@{#800218}
Reviewed-by: Jüri Valdmann <juri.valdmann@qt.io>
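The headline security fix in this roll concerns the box-filter scale path when one dimension grows while the other shrinks. A minimal sketch of a call that exercises that path, assuming the ScalePlane prototype from this roll's include/libyuv/scale.h; the dimensions are illustrative only, not taken from the bug report:

// Sketch only: exercises the box-filter case fixed in this roll
// (width shrinks while height grows). Assumes libyuv's ScalePlane.
#include <cstdint>
#include <vector>
#include "libyuv/scale.h"  // ScalePlane, kFilterBox

int main() {
  // Width shrinks (64 -> 16) while height grows (8 -> 32): the
  // combination the msan-reported box-filter fix targets.
  const int src_w = 64, src_h = 8;
  const int dst_w = 16, dst_h = 32;
  std::vector<uint8_t> src(static_cast<size_t>(src_w) * src_h, 128);
  std::vector<uint8_t> dst(static_cast<size_t>(dst_w) * dst_h);
  libyuv::ScalePlane(src.data(), src_w, src_w, src_h,
                     dst.data(), dst_w, dst_w, dst_h,
                     libyuv::kFilterBox);
  return 0;
}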
-rw-r--r--  chromium/third_party/libyuv/Android.bp | 26
-rw-r--r--  chromium/third_party/libyuv/Android.mk | 12
-rw-r--r--  chromium/third_party/libyuv/BUILD.gn | 87
-rw-r--r--  chromium/third_party/libyuv/DEPS | 2716
-rw-r--r--  chromium/third_party/libyuv/OWNERS | 8
-rw-r--r--  chromium/third_party/libyuv/README.chromium | 2
-rw-r--r--  chromium/third_party/libyuv/README.md | 8
-rw-r--r--  chromium/third_party/libyuv/build_overrides/build.gni | 10
-rw-r--r--  chromium/third_party/libyuv/docs/environment_variables.md | 20
-rw-r--r--  chromium/third_party/libyuv/docs/formats.md | 12
-rw-r--r--  chromium/third_party/libyuv/docs/getting_started.md | 26
-rw-r--r--  chromium/third_party/libyuv/docs/rotation.md | 4
-rw-r--r--  chromium/third_party/libyuv/include/libyuv.h | 1
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/compare_row.h | 12
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert.h | 138
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_argb.h | 1136
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_from.h | 169
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_from_argb.h | 30
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/cpu_id.h | 3
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/macros_msa.h | 3
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/planar_functions.h | 141
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/rotate.h | 26
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/rotate_row.h | 29
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/row.h | 1060
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale.h | 73
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_row.h | 429
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_uv.h | 38
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/version.h | 2
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/video_common.h | 33
-rw-r--r--  chromium/third_party/libyuv/libyuv.gni | 11
-rw-r--r--  chromium/third_party/libyuv/linux.mk | 29
-rw-r--r--  chromium/third_party/libyuv/source/compare.cc | 15
-rw-r--r--  chromium/third_party/libyuv/source/compare_gcc.cc | 344
-rw-r--r--  chromium/third_party/libyuv/source/compare_mmi.cc | 123
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon.cc | 70
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon64.cc | 68
-rw-r--r--  chromium/third_party/libyuv/source/convert.cc | 1108
-rw-r--r--  chromium/third_party/libyuv/source/convert_argb.cc | 2340
-rw-r--r--  chromium/third_party/libyuv/source/convert_from.cc | 790
-rw-r--r--  chromium/third_party/libyuv/source/convert_from_argb.cc | 616
-rw-r--r--  chromium/third_party/libyuv/source/convert_jpeg.cc | 310
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_argb.cc | 89
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_i420.cc | 25
-rw-r--r--  chromium/third_party/libyuv/source/cpu_id.cc | 44
-rw-r--r--  chromium/third_party/libyuv/source/mjpeg_decoder.cc | 16
-rw-r--r--  chromium/third_party/libyuv/source/mjpeg_validate.cc | 27
-rw-r--r--  chromium/third_party/libyuv/source/planar_functions.cc | 984
-rw-r--r--  chromium/third_party/libyuv/source/rotate.cc | 149
-rw-r--r--  chromium/third_party/libyuv/source/rotate_any.cc | 6
-rw-r--r--  chromium/third_party/libyuv/source/rotate_argb.cc | 95
-rw-r--r--  chromium/third_party/libyuv/source/rotate_gcc.cc | 540
-rw-r--r--  chromium/third_party/libyuv/source/rotate_mmi.cc | 291
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon.cc | 218
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon64.cc | 281
-rw-r--r--  chromium/third_party/libyuv/source/row_any.cc | 379
-rw-r--r--  chromium/third_party/libyuv/source/row_common.cc | 888
-rw-r--r--  chromium/third_party/libyuv/source/row_gcc.cc | 6422
-rw-r--r--  chromium/third_party/libyuv/source/row_mmi.cc | 7842
-rw-r--r--  chromium/third_party/libyuv/source/row_msa.cc | 602
-rw-r--r--  chromium/third_party/libyuv/source/row_neon.cc | 2482
-rw-r--r--  chromium/third_party/libyuv/source/row_neon64.cc | 2893
-rw-r--r--  chromium/third_party/libyuv/source/row_win.cc | 31
-rw-r--r--  chromium/third_party/libyuv/source/scale.cc | 196
-rw-r--r--  chromium/third_party/libyuv/source/scale_any.cc | 231
-rw-r--r--  chromium/third_party/libyuv/source/scale_argb.cc | 83
-rw-r--r--  chromium/third_party/libyuv/source/scale_common.cc | 247
-rw-r--r--  chromium/third_party/libyuv/source/scale_gcc.cc | 1420
-rw-r--r--  chromium/third_party/libyuv/source/scale_mmi.cc | 1168
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon.cc | 754
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon64.cc | 940
-rw-r--r--  chromium/third_party/libyuv/source/scale_uv.cc | 891
-rw-r--r--  chromium/third_party/libyuv/tools_libyuv/OWNERS | 5
-rwxr-xr-x  chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py | 83
-rw-r--r--  chromium/third_party/libyuv/tools_libyuv/msan/OWNERS | 4
-rw-r--r--  chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS | 5
-rw-r--r--  chromium/third_party/libyuv/unit_test/color_test.cc | 153
-rw-r--r--  chromium/third_party/libyuv/unit_test/compare_test.cc | 7
-rw-r--r--  chromium/third_party/libyuv/unit_test/convert_test.cc | 1805
-rw-r--r--  chromium/third_party/libyuv/unit_test/cpu_test.cc | 24
-rw-r--r--  chromium/third_party/libyuv/unit_test/cpu_thread_test.cc | 4
-rw-r--r--  chromium/third_party/libyuv/unit_test/math_test.cc | 5
-rw-r--r--  chromium/third_party/libyuv/unit_test/planar_test.cc | 569
-rw-r--r--  chromium/third_party/libyuv/unit_test/rotate_argb_test.cc | 42
-rw-r--r--  chromium/third_party/libyuv/unit_test/rotate_test.cc | 117
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_argb_test.cc | 81
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_test.cc | 752
-rw-r--r--  chromium/third_party/libyuv/unit_test/unit_test.cc | 6
-rw-r--r--  chromium/third_party/libyuv/unit_test/video_common_test.cc | 2
-rw-r--r--  chromium/third_party/libyuv/util/cpuid.c | 13
-rw-r--r--  chromium/third_party/libyuv/util/i444tonv12_eg.cc | 28
-rw-r--r--  chromium/third_party/libyuv/util/psnr.cc | 2
-rw-r--r--  chromium/third_party/libyuv/winarm.mk | 1
92 files changed, 35235 insertions, 10785 deletions
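Among the added files, include/libyuv/scale_uv.h (38 lines, see the diffstat above) introduces scaling for interleaved UV planes, i.e. the NV12/NV21 chroma layout referenced in the rolled commits. A hedged usage sketch, assuming the UVScale prototype that header declares (int return, 0 on success) and that FilterMode comes from libyuv/scale.h:

// Sketch only: halves an interleaved UV plane using the UVScale entry
// point added by this roll. Signature is assumed from scale_uv.h.
#include <cstdint>
#include <vector>
#include "libyuv/scale.h"     // FilterMode
#include "libyuv/scale_uv.h"  // UVScale (new in this roll)

int main() {
  // An interleaved UV plane stores U and V bytes side by side, so the
  // stride is twice the pixel width.
  const int src_w = 320, src_h = 240;
  const int dst_w = 160, dst_h = 120;
  std::vector<uint8_t> src_uv(static_cast<size_t>(src_w) * 2 * src_h, 128);
  std::vector<uint8_t> dst_uv(static_cast<size_t>(dst_w) * 2 * dst_h);
  // Assumed to return 0 on success, per libyuv convention.
  return libyuv::UVScale(src_uv.data(), src_w * 2, src_w, src_h,
                         dst_uv.data(), dst_w * 2, dst_w, dst_h,
                         libyuv::kFilterBilinear);
}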
diff --git a/chromium/third_party/libyuv/Android.bp b/chromium/third_party/libyuv/Android.bp
index fc6a81fc66a..d0b23432628 100644
--- a/chromium/third_party/libyuv/Android.bp
+++ b/chromium/third_party/libyuv/Android.bp
@@ -9,28 +9,34 @@ cc_library {
"source/compare.cc",
"source/compare_common.cc",
"source/compare_gcc.cc",
+ "source/compare_mmi.cc",
+ "source/compare_msa.cc",
"source/compare_neon.cc",
"source/compare_neon64.cc",
- "source/compare_msa.cc",
"source/convert.cc",
"source/convert_argb.cc",
"source/convert_from.cc",
"source/convert_from_argb.cc",
+ "source/convert_jpeg.cc",
"source/convert_to_argb.cc",
"source/convert_to_i420.cc",
"source/cpu_id.cc",
+ "source/mjpeg_decoder.cc",
+ "source/mjpeg_validate.cc",
"source/planar_functions.cc",
"source/rotate.cc",
"source/rotate_any.cc",
"source/rotate_argb.cc",
"source/rotate_common.cc",
"source/rotate_gcc.cc",
+ "source/rotate_mmi.cc",
"source/rotate_msa.cc",
"source/rotate_neon.cc",
"source/rotate_neon64.cc",
"source/row_any.cc",
"source/row_common.cc",
"source/row_gcc.cc",
+ "source/row_mmi.cc",
"source/row_msa.cc",
"source/row_neon.cc",
"source/row_neon64.cc",
@@ -39,13 +45,12 @@ cc_library {
"source/scale_argb.cc",
"source/scale_common.cc",
"source/scale_gcc.cc",
+ "source/scale_mmi.cc",
"source/scale_msa.cc",
"source/scale_neon.cc",
"source/scale_neon64.cc",
+ "source/scale_uv.cc",
"source/video_common.cc",
- "source/convert_jpeg.cc",
- "source/mjpeg_decoder.cc",
- "source/mjpeg_validate.cc",
],
cflags: [
@@ -65,6 +70,7 @@ cc_library {
// with libyuv (b/37646797)
cc_library_static {
name: "libyuv_static",
+ vendor_available: true,
whole_static_libs: ["libyuv"],
}
@@ -74,7 +80,6 @@ cc_test {
shared_libs: ["libjpeg"],
cflags: ["-Wall", "-Werror"],
srcs: [
- "unit_test/unit_test.cc",
"unit_test/basictypes_test.cc",
"unit_test/color_test.cc",
"unit_test/compare_test.cc",
@@ -87,6 +92,8 @@ cc_test {
"unit_test/rotate_test.cc",
"unit_test/scale_argb_test.cc",
"unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
+ "unit_test/unit_test.cc",
"unit_test/video_common_test.cc",
],
}
@@ -101,6 +108,15 @@ cc_test {
}
cc_test {
+ name: "i444tonv12_eg",
+ gtest: false,
+ srcs: [
+ "util/i444tonv12_eg.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
name: "cpuid",
gtest: false,
srcs: [
diff --git a/chromium/third_party/libyuv/Android.mk b/chromium/third_party/libyuv/Android.mk
index dbc6cad37ab..2ceb49281be 100644
--- a/chromium/third_party/libyuv/Android.mk
+++ b/chromium/third_party/libyuv/Android.mk
@@ -9,9 +9,11 @@ LOCAL_SRC_FILES := \
source/compare.cc \
source/compare_common.cc \
source/compare_gcc.cc \
+ source/compare_mmi.cc \
source/compare_msa.cc \
source/compare_neon.cc \
source/compare_neon64.cc \
+ source/compare_win.cc \
source/convert.cc \
source/convert_argb.cc \
source/convert_from.cc \
@@ -25,23 +27,30 @@ LOCAL_SRC_FILES := \
source/rotate_argb.cc \
source/rotate_common.cc \
source/rotate_gcc.cc \
+ source/rotate_mmi.cc \
source/rotate_msa.cc \
source/rotate_neon.cc \
source/rotate_neon64.cc \
+ source/rotate_win.cc \
source/row_any.cc \
source/row_common.cc \
source/row_gcc.cc \
+ source/row_mmi.cc \
source/row_msa.cc \
source/row_neon.cc \
source/row_neon64.cc \
+ source/row_win.cc \
source/scale.cc \
source/scale_any.cc \
source/scale_argb.cc \
source/scale_common.cc \
source/scale_gcc.cc \
+ source/scale_mmi.cc \
source/scale_msa.cc \
source/scale_neon.cc \
source/scale_neon64.cc \
+ source/scale_uv.cc \
+ source/scale_win.cc \
source/video_common.cc
common_CFLAGS := -Wall -fexceptions
@@ -81,7 +90,6 @@ LOCAL_MODULE_TAGS := tests
LOCAL_CPP_EXTENSION := .cc
LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
LOCAL_SRC_FILES := \
- unit_test/unit_test.cc \
unit_test/basictypes_test.cc \
unit_test/color_test.cc \
unit_test/compare_test.cc \
@@ -94,6 +102,8 @@ LOCAL_SRC_FILES := \
unit_test/rotate_test.cc \
unit_test/scale_argb_test.cc \
unit_test/scale_test.cc \
+ unit_test/scale_uv_test.cc \
+ unit_test/unit_test.cc \
unit_test/video_common_test.cc
LOCAL_MODULE := libyuv_unittest
diff --git a/chromium/third_party/libyuv/BUILD.gn b/chromium/third_party/libyuv/BUILD.gn
index 9518a8db3e1..3d5298d7041 100644
--- a/chromium/third_party/libyuv/BUILD.gn
+++ b/chromium/third_party/libyuv/BUILD.gn
@@ -6,9 +6,9 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("libyuv.gni")
import("//build/config/features.gni")
import("//testing/test.gni")
+import("libyuv.gni")
declare_args() {
# Set to false to disable building with gflags.
@@ -33,13 +33,12 @@ config("libyuv_config") {
# This target is built when no specific target is specified on the command line.
group("default") {
testonly = true
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (libyuv_include_tests) {
deps += [
":compare",
":cpuid",
+ ":i444tonv12_eg",
":libyuv_unittest",
":psnr",
":yuvconvert",
@@ -53,13 +52,9 @@ group("libyuv") {
if (is_win && target_cpu == "x64" && !use_qt) {
# Compile with clang in order to get inline assembly
- public_deps = [
- ":libyuv_internal(//build/toolchain/win:win_clang_x64)",
- ]
+ public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ]
} else {
- public_deps = [
- ":libyuv_internal",
- ]
+ public_deps = [ ":libyuv_internal" ]
}
if (libyuv_use_neon) {
@@ -70,7 +65,11 @@ group("libyuv") {
deps += [ ":libyuv_msa" ]
}
- if (!is_ios) {
+ if (libyuv_use_mmi) {
+ deps += [ ":libyuv_mmi" ]
+ }
+
+ if (!is_ios && !libyuv_disable_jpeg) {
# Make sure that clients of libyuv link with libjpeg. This can't go in
# libyuv_internal because in Windows x64 builds that will generate a clang
# build of libjpeg, and we don't want two copies.
@@ -100,6 +99,7 @@ static_library("libyuv_internal") {
"include/libyuv/scale.h",
"include/libyuv/scale_argb.h",
"include/libyuv/scale_row.h",
+ "include/libyuv/scale_uv.h",
"include/libyuv/version.h",
"include/libyuv/video_common.h",
@@ -134,6 +134,7 @@ static_library("libyuv_internal") {
"source/scale_argb.cc",
"source/scale_common.cc",
"source/scale_gcc.cc",
+ "source/scale_uv.cc",
"source/scale_win.cc",
"source/video_common.cc",
]
@@ -147,7 +148,7 @@ static_library("libyuv_internal") {
configs += [ "//build/config/gcc:symbol_visibility_default" ]
}
- if (!is_ios) {
+ if (!is_ios && !libyuv_disable_jpeg) {
defines += [ "HAVE_JPEG" ]
# Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
@@ -173,6 +174,9 @@ static_library("libyuv_internal") {
"-ffp-contract=fast", # Enable fma vectorization for NEON.
]
}
+ if (!libyuv_use_mmi) {
+ defines += [ "LIBYUV_DISABLE_MMI" ]
+ }
}
if (libyuv_use_neon) {
@@ -189,9 +193,7 @@ if (libyuv_use_neon) {
"source/scale_neon64.cc",
]
- deps = [
- ":libyuv_internal",
- ]
+ deps = [ ":libyuv_internal" ]
public_configs = [ ":libyuv_config" ]
@@ -222,10 +224,24 @@ if (libyuv_use_msa) {
"source/scale_msa.cc",
]
- deps = [
- ":libyuv_internal",
+ deps = [ ":libyuv_internal" ]
+
+ public_configs = [ ":libyuv_config" ]
+ }
+}
+
+if (libyuv_use_mmi) {
+ static_library("libyuv_mmi") {
+ sources = [
+ # MMI Source Files
+ "source/compare_mmi.cc",
+ "source/rotate_mmi.cc",
+ "source/row_mmi.cc",
+ "source/scale_mmi.cc",
]
+ deps = [ ":libyuv_internal" ]
+
public_configs = [ ":libyuv_config" ]
}
}
@@ -254,8 +270,6 @@ if (libyuv_include_tests) {
testonly = true
sources = [
- # sources
- # headers
"unit_test/basictypes_test.cc",
"unit_test/color_test.cc",
"unit_test/compare_test.cc",
@@ -268,6 +282,7 @@ if (libyuv_include_tests) {
"unit_test/rotate_test.cc",
"unit_test/scale_argb_test.cc",
"unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
"unit_test/unit_test.cc",
"unit_test/unit_test.h",
"unit_test/video_common_test.cc",
@@ -286,12 +301,10 @@ if (libyuv_include_tests) {
configs += [ ":libyuv_unittest_warnings_config" ]
- public_deps = [
- "//testing/gtest",
- ]
+ public_deps = [ "//testing/gtest" ]
public_configs = [ ":libyuv_unittest_config" ]
- if (is_linux) {
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
if (is_ios) {
@@ -328,10 +341,8 @@ if (libyuv_include_tests) {
# sources
"util/compare.cc",
]
- deps = [
- ":libyuv",
- ]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -341,10 +352,8 @@ if (libyuv_include_tests) {
# sources
"util/yuvconvert.cc",
]
- deps = [
- ":libyuv",
- ]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -356,22 +365,28 @@ if (libyuv_include_tests) {
"util/psnr_main.cc",
"util/ssim.cc",
]
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (!is_ios && !libyuv_disable_jpeg) {
defines = [ "HAVE_JPEG" ]
}
}
- executable("cpuid") {
+ executable("i444tonv12_eg") {
sources = [
# sources
- "util/cpuid.c",
+ "util/i444tonv12_eg.cc",
]
deps = [
":libyuv",
]
}
+
+ executable("cpuid") {
+ sources = [
+ # sources
+ "util/cpuid.c",
+ ]
+ deps = [ ":libyuv" ]
+ }
}
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index 60e437ef6bd..de185434500 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -1,45 +1,81 @@
vars = {
'chromium_git': 'https://chromium.googlesource.com',
- 'swarming_revision': '88229872dd17e71658fe96763feaa77915d8cbd6',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling lss
- # and whatever else without interference from each other.
- 'lss_revision': 'e6527b0cd469e3ff5764785dadcb39bf7d787154',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling catapult
- # and whatever else without interference from each other.
- 'catapult_revision': 'f3ce003c2baaf3b2aba669681f832139efe5d773',
+ 'chromium_revision': '64c8c30faaf969c15c028131dfcd0819208039c1',
+ 'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94',
}
deps = {
'src/build':
- Var('chromium_git') + '/chromium/src/build' + '@' + '8cb53523220fec0dee401d2ee5f046cbf43b0656',
+ Var('chromium_git') + '/chromium/src/build' + '@' + '2d2f9f2b85592bb9af5753ef300c055e6feb709f',
'src/buildtools':
- Var('chromium_git') + '/chromium/buildtools.git' + '@' + '5941c1b3df96c1db756a2834343533335c394c4a',
+ Var('chromium_git') + '/chromium/src/buildtools' + '@' + '6302c1175607a436e18947a5abe9df2209e845fc',
'src/testing':
- Var('chromium_git') + '/chromium/src/testing' + '@' + '60b2c69b17816251c0d20557eb818d26ac7e0fe4',
+ Var('chromium_git') + '/chromium/src/testing' + '@' + '40b44171056045ed1f85ca0b57485e46c03d7867',
'src/third_party':
- Var('chromium_git') + '/chromium/src/third_party' + '@' + 'e755204b7ae59ba1c63e5720a0420d8661672642',
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + '24ccdf9b7553446791983bf357261c5e0a4314a0',
+
+ 'src/buildtools/linux64': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/linux-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'checkout_linux',
+ },
+ 'src/buildtools/mac': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/mac-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'checkout_mac',
+ },
+ 'src/buildtools/win': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/windows-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'checkout_win',
+ },
+
+ 'src/buildtools/clang_format/script':
+ Var('chromium_git') + '/chromium/llvm-project/cfe/tools/clang-format.git' + '@' + '96636aa0e9f047f17447f2d45a094d0b59ed7917',
+ 'src/buildtools/third_party/libc++/trunk':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + 'd9040c75cfea5928c804ab7c235fed06a63f743a',
+ 'src/buildtools/third_party/libc++abi/trunk':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '196ba1aaa8ac285d94f4ea8d9836390a45360533',
+ 'src/buildtools/third_party/libunwind/trunk':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'd999d54f4bca789543a2eb6c995af2d9b5a1f3ed',
+
'src/third_party/catapult':
- Var('chromium_git') + '/catapult.git' + '@' + Var('catapult_revision'),
+ Var('chromium_git') + '/catapult.git' + '@' + 'ccc9dd2835f5a7c5c82ae3c1a2fbc2fe2fd9dfd1',
'src/third_party/colorama/src':
Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
+ 'src/third_party/depot_tools':
+ Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '91bb7506bd20ed22b8787e7a8b9975cc07e97175',
'src/third_party/freetype/src':
- Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'a44e20879cefea41663bb36ff4af908cc4146fb8',
+ Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '26e2a89598d69c7aba76c83f6a1fcf1db17574ab',
'src/third_party/googletest/src':
- Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'ba96d0b1161f540656efdaed035b3c062b60e006',
+ Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '4fe018038f87675c083d0cfb6a6b57c274fb1753',
'src/third_party/harfbuzz-ng/src':
- Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '957e7756634a4fdf1654041e20e883cf964ecac9',
+ Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'c39ab82c90479341dcf28eaa8174af6f08c0d7ae',
'src/third_party/libjpeg_turbo':
- Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'a1750dbc79a8792dde3d3f7d7d8ac28ba01ac9dd',
+ Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'd5148db386ceb4a608058320071cbed890bd6ad2',
+ 'src/third_party/nasm':
+ Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '19f3fad68da99277b2882939d3b2fa4c4b8d51d9',
'src/third_party/yasm/source/patched-yasm':
- Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + 'b98114e18d8b9b84586b10d24353ab8616d4c5fc',
+ Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
'src/tools':
- Var('chromium_git') + '/chromium/src/tools' + '@' + '55c65d8fecf04f55f5ba9e14b1fdba170f0202d0',
- 'src/tools/gyp':
- Var('chromium_git') + '/external/gyp.git' + '@' + 'd61a9397e668fa9843c4aa7da9e79460fe590bfb',
- 'src/tools/swarming_client':
- Var('chromium_git') + '/infra/luci/client-py.git' + '@' + Var('swarming_revision'),
+ Var('chromium_git') + '/chromium/src/tools' + '@' + '1bb7c085e67a0fc8c63511af83299d1632f5a3f3',
+ 'src/tools/swarming_client':
+ Var('chromium_git') + '/infra/luci/client-py.git' + '@' + 'd46ea7635f2911208268170512cb611412488fd8',
# libyuv-only dependencies (not present in Chromium).
'src/third_party/gflags':
@@ -50,7 +86,7 @@ deps = {
Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
'src/third_party/lss': {
- 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '29f7c7e018f4ce706a709f0b0afbf8bacf869480',
'condition': 'checkout_android or checkout_linux',
},
@@ -59,25 +95,27 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/accessibility-test-framework',
- 'version': 'version:2.1-cr0',
+ 'version': 'b5ec1e56e58e56bc1a0c77d43111c37f9b512c8a',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/auto/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + '8a81a858ae7b78a1aef71ac3905fade0bbd64e82',
+ 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + 'f40317ae215863102cf87fe0679ad66f4b19454e',
'condition': 'checkout_android',
},
+ 'src/third_party/boringssl/src':
+ 'https://boringssl.googlesource.com/boringssl.git' + '@' + '1607f54fed72c6589d560254626909a64124f091',
'src/base': {
- 'url': Var('chromium_git') + '/chromium/src/base' + '@' + '733a32608c5cd39c03a578cf6001afc2e6c636a2',
+ 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e096814b0448fba1095c6e7be7c7a0b5d7264251',
'condition': 'checkout_android',
},
'src/third_party/bazel': {
'packages': [
{
'package': 'chromium/third_party/bazel',
- 'version': 'version:0.10.0',
+ 'version': 'VjMsf48QUWw8n7XtJP2AuSjIGmbQeYdWdwyxVvIRLmAC',
},
],
'condition': 'checkout_android',
@@ -87,42 +125,96 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/bouncycastle',
- 'version': 'version:1.46-cr0',
+ 'version': 'c078e87552ba26e776566fdaf0f22cd8712743d0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/android_ndk': {
- 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '5cd86312e794bdf542a3685c6f10cbb96072990b',
+ 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '27c0a8d090c666a50e40fceb4ee5b40b1a2d3f87',
'condition': 'checkout_android',
},
'src/third_party/android_support_test_runner': {
'packages': [
{
'package': 'chromium/third_party/android_support_test_runner',
- 'version': 'version:0.5-cr0',
+ 'version': '96d4bf848cd210fdcbca6bcc8c1b4b39cbd93141',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_tools': {
- 'url': Var('chromium_git') + '/android_tools.git' + '@' + 'c22a664c39af72dd8f89200220713dcad811300a',
- 'condition': 'checkout_android',
+ 'src/third_party/android_sdk/public': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_sdk/public/build-tools/30.0.1',
+ 'version': '8LZujEmLjSh0g3JciDA3cslSptxKs9HOa_iUPXkOeYQC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/cmdline-tools',
+ 'version': 'ijpIFSitwBfaEdO9VXBGPqDHUVzPimXy_whw3aHTN9oC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/emulator',
+ 'version': 'A4EvXZUIuQho0QRDJopMUpgyp6NA3aiDQjGKPUKbowMC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/extras',
+ 'version': 'ppQ4TnqDvBHQ3lXx5KPq97egzF5X2FFyOrVHkGmiTMQC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/patcher',
+ 'version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platform-tools',
+ 'version': '8tF0AOj7Dwlv4j7_nfkhxWB0jzrvWWYjEIpirt8FIWYC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platforms/android-30',
+ 'version': 'YMUu9EHNZ__2Xcxl-KsaSf-dI5TMt_P62IseUVsxktMC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/sources/android-29',
+ 'version': '4gxhM8E62bvZpQs7Q3d0DinQaW0RLCIefhXrQBFkNy8C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/tools/clang/dsymutil': {
+ 'packages': [
+ {
+ 'package': 'chromium/llvm-build-tools/dsymutil',
+ 'version': 'OWlhXkmj18li3yhJk59Kmjbc5KdgLh56TwCd1qBdzlIC',
+ }
+ ],
+ 'condition': 'checkout_mac',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_build_tools/aapt2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_build_tools/aapt2',
+ 'version': 'version:3.6.0-alpha03-5516695-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
},
'src/third_party/byte_buddy': {
'packages': [
{
'package': 'chromium/third_party/byte_buddy',
- 'version': 'version:1.4.17-cr0',
+ 'version': 'c9b53316603fc2d997c899c7ca1707f809b918cd',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/ced/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '94c367a1fe3a13207f4b22604fcfd1d9f9ddf6d9',
+ 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5',
'condition': 'checkout_android',
},
'src/third_party/errorprone/lib': {
@@ -137,7 +229,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/gson',
- 'version': 'version:2.8.0-cr0',
+ 'version': '681931c9778045903a0ed59856ce2dd8dd7bf7ca',
},
],
'condition': 'checkout_android',
@@ -147,7 +239,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/guava',
- 'version': 'version:23.0-cr0',
+ 'version': 'a6fba501f3a0de88b9be1daa2052632de5b96a46',
},
],
'condition': 'checkout_android',
@@ -157,20 +249,21 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/hamcrest',
- 'version': 'version:1.3-cr0',
+ 'version': '37eccfc658fe79695d6abb6dd497463c4372032f',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
+
'src/third_party/icu': {
- 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'd888fd2a1be890f4d35e43f68d6d79f42519a357',
+ 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'c2a4cae149aae7fd30c4cbe3cf1b30df03b386f1',
},
'src/third_party/icu4j': {
'packages': [
{
'package': 'chromium/third_party/icu4j',
- 'version': 'version:53.1-cr0',
+ 'version': 'e87e5bed2b4935913ee26a3ebd0b723ee2344354',
},
],
'condition': 'checkout_android',
@@ -180,7 +273,21 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/intellij',
- 'version': 'version:12.0-cr0',
+ 'version': '77c2721b024b36ee073402c08e6d8428c0295336',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/jdk': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/jdk',
+ 'version': 'PfRSnxe8Od6WU4zBXomq-zsgcJgWmm3z4gMQNB-r2QcC',
+ },
+ {
+ 'package': 'chromium/third_party/jdk/extras',
+ 'version': 'fkhuOQ3r-zKtWEdKplpo6k0vKkjl-LY_rJTmtzFCQN4C',
},
],
'condition': 'checkout_android',
@@ -194,15 +301,19 @@ deps = {
'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
'condition': 'checkout_android',
},
+ 'src/third_party/libunwindstack': {
+ 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '11659d420a71e7323b379ea8781f07c6f384bc7e',
+ 'condition': 'checkout_android',
+ },
'src/third_party/mockito/src': {
- 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + 'de83ad4598ad4cf5ea53c69a8a8053780b04b850',
+ 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac',
'condition': 'checkout_android',
},
'src/third_party/objenesis': {
'packages': [
{
'package': 'chromium/third_party/objenesis',
- 'version': 'version:2.4-cr0',
+ 'version': '9e367f55e5a65781ee77bfcbaa88fb82b30e75c0',
},
],
'condition': 'checkout_android',
@@ -212,7 +323,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/ow2_asm',
- 'version': 'version:5.0.1-cr0',
+ 'version': 'NNAhdJzMdnutUVqfSJm5v0tVazA9l3Dd6CRwH6N4Q5kC',
},
],
'condition': 'checkout_android',
@@ -222,40 +333,64 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/r8',
- 'version': 'version:1.0.30',
+ 'version': 'N9LppKV-9lFkp7JQtmcLHhm7xHqFv0SPa6aDPtgNCdwC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/proguard': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/proguard',
+ 'version': '3bd778c422ea5496de2ef25c007a517dbb5ce5ca',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/requests/src': {
- 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
+ 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'refs/tags/v2.23.0',
'condition': 'checkout_android',
},
'src/third_party/robolectric': {
'packages': [
{
'package': 'chromium/third_party/robolectric',
- 'version': 'version:3.5.1',
+ 'version': '1KXoOiNP1a_uZNdM2ybWKwAQNow1dHTXTig-ZK4Xgq8C',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
'src/third_party/robolectric/robolectric': {
- 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '7e067f1112e1502caa742f7be72d37b5678d3403',
+ 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '2f3e0a3ac450a17dbf2e7d4eaab3a1f14dda50e6',
'condition': 'checkout_android',
},
'src/third_party/sqlite4java': {
'packages': [
{
'package': 'chromium/third_party/sqlite4java',
- 'version': 'version:0.282-cr0',
+ 'version': '889660698187baa7c8b0d79f7bf58563125fbd66',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/turbine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/turbine',
+ 'version': 'O_jNDJ4VdwYKBSDbd2BJ3mknaTFoVkvE7Po8XIiKy8sC',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
+ 'src/third_party/turbine/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/turbine.git' + '@' + '0f2a5024fe4a9bb745bcd5ac7c655cebe11649bc',
+ 'condition': 'checkout_android',
+ },
'src/third_party/ub-uiautomator/lib': {
'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
'condition': 'checkout_android',
@@ -264,7 +399,7 @@ deps = {
'packages': [
{
'package': 'chromium/third_party/xstream',
- 'version': 'version:1.4.8-cr0',
+ 'version': '4278b1b78b86ab7a1a29e64d5aec9a47a9aab0fe',
},
],
'condition': 'checkout_android',
@@ -273,7 +408,7 @@ deps = {
# iOS deps:
'src/ios': {
- 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '299ef76e844a74a1f2f4ce7f06d101861fb49aba',
+ 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '60ef55beac67e3c0eda1c35ab7944c786b377313',
'condition': 'checkout_ios'
},
@@ -285,11 +420,176 @@ deps = {
},
# === ANDROID_DEPS Generated Code Start ===
- # Generated by //tools/android/roll/android_deps/fetch_all.sh
- 'src/third_party/android_deps/repository/android_arch_core_common': {
+ # Generated by //third_party/android_deps/fetch_all.py
+ 'src/third_party/android_deps/libs/android_arch_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_core_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/android_arch_core_common',
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_activity_activity': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_activity_activity',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_annotation_annotation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_annotation_annotation_experimental': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation_experimental',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_appcompat_appcompat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_appcompat_appcompat',
+ 'version': 'version:1.2.0-beta01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_appcompat_appcompat_resources': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_appcompat_appcompat_resources',
+ 'version': 'version:1.2.0-beta01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_arch_core_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_arch_core_core_common',
+ 'version': 'version:2.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_arch_core_core_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_arch_core_core_runtime',
+ 'version': 'version:2.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_asynclayoutinflater_asynclayoutinflater': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_asynclayoutinflater_asynclayoutinflater',
'version': 'version:1.0.0-cr0',
},
],
@@ -297,10 +597,10 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/android_arch_lifecycle_common': {
+ 'src/third_party/android_deps/libs/androidx_cardview_cardview': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/android_arch_lifecycle_common',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_cardview_cardview',
'version': 'version:1.0.0-cr0',
},
],
@@ -308,10 +608,21 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/android_arch_lifecycle_runtime': {
+ 'src/third_party/android_deps/libs/androidx_collection_collection': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/android_arch_lifecycle_runtime',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_collection_collection',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_concurrent_concurrent_futures': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_concurrent_concurrent_futures',
'version': 'version:1.0.0-cr0',
},
],
@@ -319,87 +630,98 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_animated_vector_drawable': {
+ 'src/third_party/android_deps/libs/androidx_coordinatorlayout_coordinatorlayout': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_animated_vector_drawable',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_coordinatorlayout_coordinatorlayout',
+ 'version': 'version:1.1.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_appcompat_v7': {
+ 'src/third_party/android_deps/libs/androidx_core_core': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_appcompat_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_core_core',
+ 'version': 'version:1.3.0-beta01-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_cardview_v7': {
+ 'src/third_party/android_deps/libs/androidx_cursoradapter_cursoradapter': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_cardview_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_cursoradapter_cursoradapter',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_design': {
+ 'src/third_party/android_deps/libs/androidx_customview_customview': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_design',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_customview_customview',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_gridlayout_v7': {
+ 'src/third_party/android_deps/libs/androidx_documentfile_documentfile': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_gridlayout_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_documentfile_documentfile',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_leanback_v17': {
+ 'src/third_party/android_deps/libs/androidx_drawerlayout_drawerlayout': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_leanback_v17',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_drawerlayout_drawerlayout',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_mediarouter_v7': {
+ 'src/third_party/android_deps/libs/androidx_exifinterface_exifinterface': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_mediarouter_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_exifinterface_exifinterface',
+ 'version': 'version:1.0.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_multidex': {
+ 'src/third_party/android_deps/libs/androidx_fragment_fragment': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_multidex',
+ 'package': 'chromium/third_party/android_deps/libs/androidx_fragment_fragment',
+ 'version': 'version:1.2.5-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_gridlayout_gridlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_gridlayout_gridlayout',
'version': 'version:1.0.0-cr0',
},
],
@@ -407,165 +729,2211 @@ deps = {
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_palette_v7': {
+ 'src/third_party/android_deps/libs/androidx_interpolator_interpolator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_interpolator_interpolator',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_leanback_leanback': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_leanback_leanback',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_leanback_leanback_preference': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_leanback_leanback_preference',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_preference_v14': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_preference_v14',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_core_ui': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_core_ui',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_core_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_core_utils',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_v13': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_v13',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_legacy_legacy_support_v4': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_v4',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common_java8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common_java8',
+ 'version': 'version:2.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata',
+ 'version': 'version:2.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata_core',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_runtime',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel_savedstate': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel_savedstate',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_loader_loader': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_loader_loader',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_localbroadcastmanager_localbroadcastmanager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_localbroadcastmanager_localbroadcastmanager',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_media_media': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_media_media',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_mediarouter_mediarouter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_mediarouter_mediarouter',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_multidex_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_multidex_multidex',
+ 'version': 'version:2.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_palette_palette': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_palette_palette',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_preference_preference': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_preference_preference',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_print_print': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_print_print',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_recyclerview_recyclerview': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_recyclerview_recyclerview',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_savedstate_savedstate': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_savedstate_savedstate',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_slice_slice_builders': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_slice_slice_builders',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_slice_slice_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_slice_slice_core',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_slidingpanelayout_slidingpanelayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_slidingpanelayout_slidingpanelayout',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_swiperefreshlayout_swiperefreshlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_swiperefreshlayout_swiperefreshlayout',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_core',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_contrib': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_contrib',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_core',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_idling_resource': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_idling_resource',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_intents': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_intents',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_espresso_espresso_web': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_web',
+ 'version': 'version:3.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_ext_junit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_ext_junit',
+ 'version': 'version:1.1.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_monitor': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_monitor',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_rules': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_rules',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_runner': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_runner',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_test_uiautomator_uiautomator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_test_uiautomator_uiautomator',
+ 'version': 'version:2.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_transition_transition': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_transition_transition',
+ 'version': 'version:1.2.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_tvprovider_tvprovider': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_tvprovider_tvprovider',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable_animated': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable_animated',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_versionedparcelable_versionedparcelable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_versionedparcelable_versionedparcelable',
+ 'version': 'version:1.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_viewpager2_viewpager2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_viewpager2_viewpager2',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_viewpager_viewpager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_viewpager_viewpager',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_webkit_webkit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_webkit_webkit',
+ 'version': 'version:1.3.0-rc01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/androidx_window_window': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/androidx_window_window',
+ 'version': 'version:1.0.0-alpha01-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent',
+ 'version': 'version:3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/classworlds_classworlds': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/classworlds_classworlds',
+ 'version': 'version:1.1-alpha-2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_collections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cursoradapter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_customview': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_design': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_documentfile': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_drawerlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_gridlayout_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_gridlayout_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_interpolator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_leanback_v17': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_leanback_v17',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_loader': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_mediarouter_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_mediarouter_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_palette_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_palette_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_leanback_v17': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_leanback_v17',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_v14': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v14',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_print': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_print',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_ui': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_fragment': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_media_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v13': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v13',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v4': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_transition': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_viewpager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager',
+ 'version': 'version:28.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_core',
+ 'version': 'version:1.0.0-beta08-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_processor': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_processor',
+ 'version': 'version:1.0.0-beta08-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs',
+ 'version': 'version:1.0.10-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration',
+ 'version': 'version:1.0.10-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine',
+ 'version': 'version:2.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms',
+ 'version': 'version:1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
+ 'version': 'version:17.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido',
+ 'version': 'version:18.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
+ 'version': 'version:17.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
+ 'version': 'version:18.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
+ 'version': 'version:18.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_material_material': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material',
+ 'version': 'version:1.2.0-alpha06-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_auto_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common',
+ 'version': 'version:0.10-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service',
+ 'version': 'version:1.0-rc6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations',
+ 'version': 'version:1.0-rc6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations',
+ 'version': 'version:1.7-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_findbugs_jFormatString': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jformatstring',
+ 'version': 'version:3.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305',
+ 'version': 'version:3.0.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_gson_gson': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson',
+ 'version': 'version:2.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi',
+ 'version': 'version:2.26-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations',
+ 'version': 'version:2.4.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac',
+ 'version': 'version:9+181-r4173-1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded',
+ 'version': 'version:9-dev-r4023-3-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format',
+ 'version': 'version:1.5-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_failureaccess': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess',
+ 'version': 'version:1.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_guava': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_palette_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava',
+ 'version': 'version:27.1-jre-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_preference_leanback_v17': {
+ 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_preference_leanback_v17',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture',
+ 'version': 'version:1.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_preference_v14': {
+ 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_preference_v14',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations',
+ 'version': 'version:1.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_preference_v7': {
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_preference_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java',
+ 'version': 'version:3.4.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_recyclerview_v7': {
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_recyclerview_v7',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite',
+ 'version': 'version:3.13.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_annotations': {
+ 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_annotations',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils',
+ 'version': 'version:1.3.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_compat': {
+ 'src/third_party/android_deps/libs/com_squareup_javapoet': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_compat',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet',
+ 'version': 'version:1.11.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_core_ui': {
+ 'src/third_party/android_deps/libs/com_squareup_javawriter': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_core_ui',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter',
+ 'version': 'version:2.1.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_core_utils': {
+ 'src/third_party/android_deps/libs/commons_cli_commons_cli': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_core_utils',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/commons_cli_commons_cli',
+ 'version': 'version:1.3.1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_fragment': {
+ 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_fragment',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api',
+ 'version': 'version:1.3.2-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_media_compat': {
+ 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_media_compat',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api',
+ 'version': 'version:1.0-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_v13': {
+ 'src/third_party/android_deps/libs/javax_inject_javax_inject': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_v13',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject',
+ 'version': 'version:1-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_v4': {
+ 'src/third_party/android_deps/libs/nekohtml_nekohtml': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_v4',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/nekohtml_nekohtml',
+ 'version': 'version:1.9.6.2-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_support_vector_drawable': {
+ 'src/third_party/android_deps/libs/nekohtml_xercesMinimal': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_support_vector_drawable',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/nekohtml_xercesminimal',
+ 'version': 'version:1.9.6.2-cr0',
},
],
'condition': 'checkout_android',
'dep_type': 'cipd',
},
- 'src/third_party/android_deps/repository/com_android_support_transition': {
+ 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': {
'packages': [
{
- 'package': 'chromium/third_party/android_deps/repository/com_android_support_transition',
- 'version': 'version:27.0.0-cr0',
+ 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap',
+ 'version': 'version:0.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_sf_kxml_kxml2',
+ 'version': 'version:2.3.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_ant_ant': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant',
+ 'version': 'version:1.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_ant_ant_launcher': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant_launcher',
+ 'version': 'version:1.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks',
+ 'version': 'version:2.1.3-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_model': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_model',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_profile': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_profile',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_project': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_project',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_maven_settings': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_settings',
+ 'version': 'version:2.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api',
+ 'version': 'version:1.0-beta-6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup',
+ 'version': 'version:1.2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual',
+ 'version': 'version:2.5.3-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual',
+ 'version': 'version:2.10.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_dataflow_shaded': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_shaded',
+ 'version': 'version:3.1.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations',
+ 'version': 'version:1.17-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default',
+ 'version': 'version:1.0-alpha-9-stable-1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation',
+ 'version': 'version:1.11-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils',
+ 'version': 'version:1.5.15-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jdom_jdom2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jdom_jdom2',
+ 'version': 'version:2.0.6-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_annotations',
+ 'version': 'version:13.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib',
+ 'version': 'version:1.3.50-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common',
+ 'version': 'version:1.3.50-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm',
+ 'version': 'version:0.1.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util',
+ 'version': 'version:7.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_pcollections_pcollections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections',
+ 'version': 'version:2.1.2-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_junit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_pluginapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_resources': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_robolectric': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_sandbox': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadowapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_multidex',
+ 'version': 'version:4.3.1-cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector',
+ 'version': 'version:4.3.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_threeten_threeten_extra': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_threeten_threeten_extra',
+ 'version': 'version:1.5.0-cr0',
},
],
'condition': 'checkout_android',
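
Every entry in the long hunk above instantiates the same CIPD dependency template. As a reading aid, here is a minimal annotated sketch of that pattern in DEPS-style Python; the group/artifact names and the version are placeholders, not a real package from this patch:

    deps = {
        'src/third_party/android_deps/libs/example_group_example_artifact': {
            'packages': [
                {
                    # CIPD package path mirroring the Maven coordinates.
                    'package': 'chromium/third_party/android_deps/libs/example_group_example_artifact',
                    # Upstream version plus a Chromium packaging suffix.
                    'version': 'version:1.0.0-cr0',
                },
            ],
            # Fetched only for checkouts with an Android target OS.
            'condition': 'checkout_android',
            # Resolved via the CIPD client instead of git.
            'dep_type': 'cipd',
        },
    }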
@@ -651,24 +3019,39 @@ hooks = [
'name': 'mac_toolchain',
'pattern': '.',
'action': ['python', 'src/build/mac_toolchain.py'],
+ 'condition': 'checkout_mac',
},
- # Pull binutils for linux, enabled debug fission for faster linking /
- # debugging when used with clang on Ubuntu Precise.
- # https://code.google.com/p/chromium/issues/detail?id=352046
+ # Pull the msan libraries on linux.
{
- 'name': 'binutils',
- 'pattern': 'src/third_party/binutils',
- 'action': [
- 'python',
- 'src/third_party/binutils/download.py',
- ],
+ 'name': 'msan_chained_origins',
+ 'pattern': '.',
+ 'condition': 'checkout_linux',
+ 'action': [ 'python',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-trusty.tgz.sha1',
+ ],
+ },
+ {
+ 'name': 'msan_no_origins',
+ 'pattern': '.',
+ 'condition': 'checkout_linux',
+ 'action': [ 'python',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-trusty.tgz.sha1',
+ ],
},
{
# Pull clang if needed or requested via GYP_DEFINES.
# Note: On Win, this should run after win_toolchain, as it may use it.
'name': 'clang',
'pattern': '.',
- 'action': ['python', 'src/tools/clang/scripts/update.py', '--if-needed'],
+ 'action': ['python', 'src/tools/clang/scripts/update.py'],
},
{
# Update LASTCHANGE.
@@ -677,40 +3060,6 @@ hooks = [
'action': ['python', 'src/build/util/lastchange.py',
'-o', 'src/build/util/LASTCHANGE'],
},
- # Pull GN binaries.
- {
- 'name': 'gn_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/win/gn.exe.sha1',
- ],
- },
- {
- 'name': 'gn_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/mac/gn.sha1',
- ],
- },
- {
- 'name': 'gn_linux64',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/linux64/gn.sha1',
- ],
- },
# Pull clang-format binaries using checked-in hashes.
{
'name': 'clang_format_win',
@@ -737,6 +3086,7 @@ hooks = [
{
'name': 'clang_format_linux',
'pattern': '.',
+ 'condition': 'host_os == "linux"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=linux*',
@@ -791,26 +3141,6 @@ hooks = [
'--root', 'src',
],
},
- # Android dependencies. Many are downloaded using Google Storage these days.
- # They're copied from https://cs.chromium.org/chromium/src/DEPS for all
- # such dependencies we share with Chromium.
- {
- # This downloads SDK extras and puts them in the
- # third_party/android_tools/sdk/extras directory.
- 'name': 'sdkextras',
- 'pattern': '.',
- # When adding a new sdk extras package to download, add the package
- # directory and zip file to .gitignore in third_party/android_tools.
- 'action': ['python',
- 'src/build/android/play_services/update.py',
- 'download'
- ],
- },
]
-recursedeps = [
- # buildtools provides clang_format, libc++, and libc++abi.
- 'src/buildtools',
- # android_tools manages the NDK.
- 'src/third_party/android_tools',
-]
+recursedeps = []
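
The two msan hooks added above follow the usual gated download pattern: run a storage fetch only when the matching platform was checked out. A minimal sketch of that hook shape, with a placeholder name, bucket, and .sha1 path (illustrative only, not part of this patch):

    hooks = [
        {
            'name': 'example_instrumented_library',
            'pattern': '.',
            # Skipped entirely unless a Linux checkout was requested.
            'condition': 'checkout_linux',
            'action': ['python',
                       'src/third_party/depot_tools/download_from_google_storage.py',
                       '--no_resume',
                       '--no_auth',
                       '--bucket', 'example-bucket',
                       '-s', 'src/third_party/example/archive.tgz.sha1'],
        },
    ]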
diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS
index 7b21adfe6c7..755c220be4d 100644
--- a/chromium/third_party/libyuv/OWNERS
+++ b/chromium/third_party/libyuv/OWNERS
@@ -1,8 +1,12 @@
+mbonadei@chromium.org
fbarchard@chromium.org
magjed@chromium.org
+pbos@chromium.org
-per-file *.gn=phoglund@chromium.org
+per-file *.gn=mbonadei@chromium.org
per-file .gitignore=*
per-file AUTHORS=*
per-file DEPS=*
-per-file PRESUBMIT.py=phoglund@chromium.org
+per-file PRESUBMIT.py=mbonadei@chromium.org
+
+# COMPONENT: Internals>Images>Codecs
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index 4ecdcb2840b..4a7e30b087c 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1714
+Version: 1768
License: BSD
License File: LICENSE
diff --git a/chromium/third_party/libyuv/README.md b/chromium/third_party/libyuv/README.md
index 7b6619220b8..db70b7f08d3 100644
--- a/chromium/third_party/libyuv/README.md
+++ b/chromium/third_party/libyuv/README.md
@@ -10,9 +10,9 @@
### Development
-See [Getting started] [1] for instructions on how to get started developing.
+See [Getting started][1] for instructions on how to get started developing.
-You can also browse the [docs directory] [2] for more documentation.
+You can also browse the [docs directory][2] for more documentation.
-[1]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/getting_started.md
-[2]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/
+[1]: ./docs/getting_started.md
+[2]: ./docs/
diff --git a/chromium/third_party/libyuv/build_overrides/build.gni b/chromium/third_party/libyuv/build_overrides/build.gni
index 6d8319b965e..a83860a8eb8 100644
--- a/chromium/third_party/libyuv/build_overrides/build.gni
+++ b/chromium/third_party/libyuv/build_overrides/build.gni
@@ -44,3 +44,13 @@ if (host_os == "mac") {
"hermetic toolchain if the minimum OS version is not met.")
use_system_xcode = _result == 0
}
+
+declare_args() {
+ # Tracing support requires //third_party/perfetto.
+ enable_base_tracing = false
+ use_perfetto_client_library = false
+
+ # Allows googletest to pretty-print various absl types.
+ # Defined here rather than in gtest.gni to match chromium.
+ gtest_enable_absl_printers = true
+}
diff --git a/chromium/third_party/libyuv/docs/environment_variables.md b/chromium/third_party/libyuv/docs/environment_variables.md
index c28d83e7dc1..cd8159ad5a8 100644
--- a/chromium/third_party/libyuv/docs/environment_variables.md
+++ b/chromium/third_party/libyuv/docs/environment_variables.md
@@ -6,7 +6,10 @@ For test purposes, environment variables can be set to control libyuv behavior.
By default the cpu is detected and the most advanced form of SIMD is used. But you can disable instruction sets selectively, or completely, falling back on C code. Set the variable to 1 to disable the specified instruction set.
+## All CPUs
LIBYUV_DISABLE_ASM
+
+## Intel CPUs
LIBYUV_DISABLE_X86
LIBYUV_DISABLE_SSE2
LIBYUV_DISABLE_SSSE3
@@ -14,12 +17,25 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
LIBYUV_DISABLE_SSE42
LIBYUV_DISABLE_AVX
LIBYUV_DISABLE_AVX2
- LIBYUV_DISABLE_AVX512BW
LIBYUV_DISABLE_ERMS
LIBYUV_DISABLE_FMA3
- LIBYUV_DISABLE_MSA
+ LIBYUV_DISABLE_F16C
+ LIBYUV_DISABLE_AVX512BW
+ LIBYUV_DISABLE_AVX512VL
+ LIBYUV_DISABLE_AVX512VBMI
+ LIBYUV_DISABLE_AVX512VBMI2
+ LIBYUV_DISABLE_AVX512VBITALG
+ LIBYUV_DISABLE_AVX512VPOPCNTDQ
+ LIBYUV_DISABLE_GFNI
+
+## ARM CPUs
+
LIBYUV_DISABLE_NEON
+## MIPS CPUs
+ LIBYUV_DISABLE_MSA
+ LIBYUV_DISABLE_MMI
+
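Each of the variables above disables one instruction-set path when set to 1, with everything falling back to C code if all SIMD is off. A sketch of driving a test run that way from Python; the unittest binary path is an assumption about the local build layout, not something this patch prescribes:

    import os
    import subprocess

    # Disable AVX2 so the run exercises the next-best SIMD path (or C).
    env = dict(os.environ, LIBYUV_DISABLE_AVX2='1')
    subprocess.run(['out/Release/libyuv_unittest'], env=env, check=True)
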
# Test Width/Height/Repeat
The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test specific resolutions.
diff --git a/chromium/third_party/libyuv/docs/formats.md b/chromium/third_party/libyuv/docs/formats.md
index 97e8ce05f48..a29ed5c3043 100644
--- a/chromium/third_party/libyuv/docs/formats.md
+++ b/chromium/third_party/libyuv/docs/formats.md
@@ -36,7 +36,7 @@ This is how OSX formats map to libyuv
The following is extracted from video_common.h as a complete list of formats supported by libyuv.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -46,9 +46,11 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020, unofficial fourcc.
+ // 10 bit lsb
// 1 Secondary YUV format: row biplanar.
- FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'), // deprecated.
// 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -66,7 +68,7 @@ The following is extracted from video_common.h as a complete list of formats sup
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ // 11 Auxiliary YUV variations: 3 with U and V planes swapped, 1 Alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
@@ -75,6 +77,9 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -161,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
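
As a hedged sketch of that NULL-Y note, the new AYUVToNV21 from libyuv/convert.h (declared later in this patch) could extract just the interleaved VU plane; whether this particular function is among the "most" that accept a NULL destination Y pointer is an assumption here:

    #include <cstdint>
    #include <vector>
    #include "libyuv/convert.h"

    void ExtractVU(const uint8_t* src_ayuv, int width, int height,
                   std::vector<uint8_t>* vu) {
      int half_w = (width + 1) / 2;
      int half_h = (height + 1) / 2;
      vu->resize(half_w * 2 * half_h);
      // dst_y == NULL skips the luma plane, per the note above.
      libyuv::AYUVToNV21(src_ayuv, width * 4,
                         /*dst_y=*/nullptr, /*dst_stride_y=*/0,
                         vu->data(), half_w * 2, width, height);
    }
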
diff --git a/chromium/third_party/libyuv/docs/getting_started.md b/chromium/third_party/libyuv/docs/getting_started.md
index f547c419d67..3e339712e19 100644
--- a/chromium/third_party/libyuv/docs/getting_started.md
+++ b/chromium/third_party/libyuv/docs/getting_started.md
@@ -27,7 +27,7 @@ Then you'll get a .gclient file like:
},
];
-For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `gclient sync`.
Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master
@@ -48,11 +48,8 @@ For Android add `;target_os=['android'];` to your Linux .gclient
Then run:
- export GYP_DEFINES="OS=android"
gclient sync
-The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
-
To get just the source (not buildable):
git clone https://chromium.googlesource.com/libyuv/libyuv
@@ -135,8 +132,8 @@ ia32
mips
- gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=true"
- gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=true"
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true"
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true"
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
@@ -152,15 +149,15 @@ arm disassembly:
Running tests:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=*
Running test as benchmark:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1"
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Running test with C code:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1
### Build targets
@@ -178,13 +175,22 @@ Running test with C code:
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
+### MIPS Linux
+
+mips
+
+ gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
## Building the Library with make
### Linux
make V=1 -f linux.mk
make V=1 -f linux.mk clean
- make V=1 -f linux.mk CXX=clang++
+ make V=1 -f linux.mk CXX=clang++ CC=clang
## Building the library with cmake
diff --git a/chromium/third_party/libyuv/docs/rotation.md b/chromium/third_party/libyuv/docs/rotation.md
index fb84fce5a9c..a08430fded0 100644
--- a/chromium/third_party/libyuv/docs/rotation.md
+++ b/chromium/third_party/libyuv/docs/rotation.md
@@ -100,4 +100,8 @@ Inverting can be achieved with almost any libyuv function by passing a negative
I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+# Cropping - Vertical Flip
+When cropping from a subsampled format like NV21, the method of setting the start pointers won't work for an odd crop start y on the UV plane.
+If the height after cropping is odd, invert the source instead: point to the last row, negate the strides, and pass a negative height, which
+will re-invert the image as the conversion outputs.
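
A sketch of that trick, assuming an NV12 source and the NV12ToI420 entry point from libyuv/convert.h; the pointer math and buffer names are illustrative only:

    #include <cstdint>
    #include "libyuv/convert.h"

    // Crop starting at an odd row by inverting the source: point at the
    // last cropped row, negate the strides, and pass a negative height
    // so the conversion re-inverts the output to upright.
    int CropNV12OddY(const uint8_t* src_y, int src_stride_y,
                     const uint8_t* src_uv, int src_stride_uv,
                     int crop_y, int crop_height, int width,
                     uint8_t* dst_y, int dst_stride_y,
                     uint8_t* dst_u, int dst_stride_u,
                     uint8_t* dst_v, int dst_stride_v) {
      const uint8_t* y = src_y + (crop_y + crop_height - 1) * src_stride_y;
      const uint8_t* uv =
          src_uv + ((crop_y + crop_height - 1) / 2) * src_stride_uv;
      return libyuv::NV12ToI420(y, -src_stride_y, uv, -src_stride_uv,
                                dst_y, dst_stride_y, dst_u, dst_stride_u,
                                dst_v, dst_stride_v, width, -crop_height);
    }
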
diff --git a/chromium/third_party/libyuv/include/libyuv.h b/chromium/third_party/libyuv/include/libyuv.h
index aeffd5ef7a4..a06e1233abb 100644
--- a/chromium/third_party/libyuv/include/libyuv.h
+++ b/chromium/third_party/libyuv/include/libyuv.h
@@ -26,6 +26,7 @@
#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"
#include "libyuv/version.h"
#include "libyuv/video_common.h"
diff --git a/chromium/third_party/libyuv/include/libyuv/compare_row.h b/chromium/third_party/libyuv/include/libyuv/compare_row.h
index e81f7455eee..e95b9d93eb2 100644
--- a/chromium/third_party/libyuv/include/libyuv/compare_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/compare_row.h
@@ -84,6 +84,11 @@ extern "C" {
#define HAS_SUMSQUAREERROR_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_HAMMINGDISTANCE_MMI
+#define HAS_SUMSQUAREERROR_MMI
+#endif
+
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -102,7 +107,9 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t HammingDistance_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
-
+uint32_t HammingDistance_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
uint32_t SumSquareError_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -118,6 +125,9 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
uint32_t SumSquareError_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
+uint32_t SumSquareError_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
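
These row functions are internal, but the portable C variant is callable directly for a quick check; a hedged sketch (that HammingDistance_C counts differing bits over count bytes is inferred from its use in the compare code):

    #include <cstdint>
    #include "libyuv/compare_row.h"

    uint32_t BitDiff(const uint8_t* a, const uint8_t* b, int count) {
      // Portable fallback; the _NEON/_MSA/_MMI variants share this
      // signature and are picked by the higher-level dispatch.
      return libyuv::HammingDistance_C(a, b, count);
    }
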
diff --git a/chromium/third_party/libyuv/include/libyuv/convert.h b/chromium/third_party/libyuv/include/libyuv/convert.h
index d12ef24f799..026b153cefe 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert.h
@@ -42,6 +42,36 @@ int I444ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Convert I422 to I420.
LIBYUV_API
int I422ToI420(const uint8_t* src_y,
@@ -59,6 +89,21 @@ int I422ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I422 to NV21.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Copy I420 to I420.
#define I420ToI420 I420Copy
LIBYUV_API
@@ -127,6 +172,17 @@ int I400ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
@@ -185,16 +241,25 @@ int UYVYToI420(const uint8_t* src_uyvy,
int width,
int height);
-// Convert M420 to I420.
+// Convert AYUV to NV12.
LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
int width,
int height);
@@ -281,6 +346,19 @@ int RGB24ToI420(const uint8_t* src_rgb24,
int width,
int height);
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
// RGB big endian (rgb in memory) to I420.
LIBYUV_API
int RAWToI420(const uint8_t* src_raw,
@@ -333,7 +411,24 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
int width,
int height);
-#ifdef HAVE_JPEG
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
LIBYUV_API
@@ -350,13 +445,38 @@ int MJPGToI420(const uint8_t* sample,
int dst_width,
int dst_height);
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8_t* sample,
size_t sample_size,
int* width,
int* height);
-#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_argb.h
index ab772b6c323..715a3dad97d 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_argb.h
@@ -15,16 +15,41 @@
#include "libyuv/rotate.h" // For enum RotationMode.
-// TODO(fbarchard): This set of functions should exactly match convert.h
-// TODO(fbarchard): Add tests. Create random content of right size and convert
-// with C vs Opt and or to I420 and compare.
-// TODO(fbarchard): Some of these functions lack parameter setting.
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// Conversion matrix for YUV to RGB
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+
+// Conversion matrix for YVU to BGR
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+
+// Macros for end swapped destination Matrix conversions.
+// Swap UV and pass mirrored kYvuJPEGConstants matrix.
+// TODO(fbarchard): Add macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+
// Alias.
#define ARGBToARGB ARGBCopy
@@ -50,7 +75,7 @@ int I420ToARGB(const uint8_t* src_y,
int width,
int height);
-// Duplicate prototype for function in convert_from.h for remoting.
+// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
@@ -63,19 +88,292 @@ int I420ToABGR(const uint8_t* src_y,
int width,
int height);
-// Convert I010 to ARGB.
+// Convert J420 to ARGB.
LIBYUV_API
-int I010ToARGB(const uint16_t* src_y,
+int J420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint16_t* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint16_t* src_v,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
// Convert I010 to ARGB.
LIBYUV_API
int I010ToARGB(const uint16_t* src_y,
@@ -128,52 +426,104 @@ int H010ToABGR(const uint16_t* src_y,
int width,
int height);
-// Convert I422 to ARGB.
+// Convert U010 to ARGB.
LIBYUV_API
-int I422ToARGB(const uint8_t* src_y,
+int U010ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
-// Convert I444 to ARGB.
+// Convert U010 to ABGR.
LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
+int U010ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
-// Convert J444 to ARGB.
+// Convert I210 to ABGR.
LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
+int I210ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
-// Convert I444 to ABGR.
+// Convert H210 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
+int H210ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_abgr,
int dst_stride_abgr,
@@ -256,6 +606,7 @@ int NV21ToARGB(const uint8_t* src_y,
int height);
// Convert NV12 to ABGR.
+LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
@@ -298,14 +649,38 @@ int NV21ToRGB24(const uint8_t* src_y,
int width,
int height);
-// Convert M420 to ARGB.
+// Convert NV21 to YUV24.
LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height);
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
// Convert YUY2 to ARGB.
LIBYUV_API
@@ -325,126 +700,113 @@ int UYVYToARGB(const uint8_t* src_uyvy,
int width,
int height);
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J422 to ARGB.
+// Convert I010 to AR30.
LIBYUV_API
-int J422ToARGB(const uint8_t* src_y,
+int I010ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert J420 to ABGR.
+// Convert I010 to AB30.
LIBYUV_API
-int J420ToABGR(const uint8_t* src_y,
+int I010ToAB30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert J422 to ABGR.
+// Convert H010 to AR30.
LIBYUV_API
-int J422ToABGR(const uint8_t* src_y,
+int H010ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H420 to ARGB.
+// Convert H010 to AB30.
LIBYUV_API
-int H420ToARGB(const uint8_t* src_y,
+int H010ToAB30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert H422 to ARGB.
+// Convert U010 to AR30.
LIBYUV_API
-int H422ToARGB(const uint8_t* src_y,
+int U010ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H420 to ABGR.
+// Convert U010 to AB30.
LIBYUV_API
-int H420ToABGR(const uint8_t* src_y,
+int U010ToAB30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert H422 to ABGR.
+// Convert I210 to AR30.
LIBYUV_API
-int H422ToABGR(const uint8_t* src_y,
+int I210ToAR30(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H010 to ARGB.
+// Convert I210 to AB30.
LIBYUV_API
-int H010ToARGB(const uint16_t* src_y,
+int I210ToAB30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert I010 to AR30.
+// Convert H210 to AR30.
LIBYUV_API
-int I010ToAR30(const uint16_t* src_y,
+int H210ToAR30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
@@ -455,35 +817,35 @@ int I010ToAR30(const uint16_t* src_y,
int width,
int height);
-// Convert H010 to AR30.
+// Convert H210 to AB30.
LIBYUV_API
-int H010ToAR30(const uint16_t* src_y,
+int H210ToAB30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
int width,
int height);
-// Convert I010 to AB30.
+// Convert U210 to AR30.
LIBYUV_API
-int I010ToAB30(const uint16_t* src_y,
+int U210ToAR30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
int width,
int height);
-// Convert H010 to AB30.
+// Convert U210 to AB30.
LIBYUV_API
-int H010ToAB30(const uint16_t* src_y,
+int U210ToAB30(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
@@ -542,6 +904,15 @@ int RAWToARGB(const uint8_t* src_raw,
int width,
int height);
+// RGB big endian (rgb in memory) to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
// RGB16 (RGBP fourcc) little endian to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8_t* src_rgb565,
@@ -601,7 +972,6 @@ int AR30ToAB30(const uint8_t* src_ar30,
int width,
int height);
-#ifdef HAVE_JPEG
// src_width/height provided by capture
// dst_width/height for clipping determine final size.
LIBYUV_API
@@ -613,7 +983,6 @@ int MJPGToARGB(const uint8_t* sample,
int src_height,
int dst_width,
int dst_height);
-#endif
// Convert Android420 to ARGB.
LIBYUV_API
@@ -643,6 +1012,561 @@ int Android420ToABGR(const uint8_t* src_y,
int width,
int height);
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with a 4x4 dither matrix (16 bytes).
+// Values in the dither matrix from 0 to 7 are recommended.
+// The dither matrix is ordered with the first byte as the upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I400 (grey) to ARGB with matrix. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "sample_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from.h b/chromium/third_party/libyuv/include/libyuv/convert_from.h
index 5cd8a4bfc04..5140ed4f3e9 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_from.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from.h
@@ -23,6 +23,7 @@ extern "C" {
// Convert 8 bit YUV to 10 bit.
#define H420ToH010 I420ToI010
+LIBYUV_API
int I420ToI010(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
@@ -131,6 +132,10 @@ int I420ToUYVY(const uint8_t* src_y,
int width,
int height);
+// The following are from convert_argb.h
+// DEPRECATED: These prototypes will be removed in the future. Use convert_argb.h instead.
+
+// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
@@ -143,18 +148,7 @@ int I420ToARGB(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
-
+// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
@@ -167,157 +161,6 @@ int I420ToABGR(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height);
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
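
Migration sketch for the deprecation note above: include libyuv/convert_argb.h directly instead of relying on the duplicate prototypes kept here (packed strides, hypothetical names):

    #include <cstdint>
    #include "libyuv/convert_argb.h"

    void I420PackedToARGB(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, int width, int height,
                          uint8_t* argb) {
      int half = (width + 1) / 2;
      libyuv::I420ToARGB(y, width, u, half, v, half,
                         argb, width * 4, width, height);
    }
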
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
index 05c815a093e..d992363cebb 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
int width,
int height);
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,
@@ -210,6 +214,15 @@ int ARGBToJ400(const uint8_t* src_argb,
int width,
int height);
+// Convert RGBA to J400. (JPeg full range).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8_t* src_argb,
@@ -250,10 +263,21 @@ int ARGBToNV21(const uint8_t* src_argb,
int width,
int height);
-// Convert ARGB To NV21.
+// Convert ABGR To NV12.
LIBYUV_API
-int ARGBToNV21(const uint8_t* src_argb,
- int src_stride_argb,
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert ABGR To NV21.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
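
An illustrative call of the new ABGRToNV12 with tightly packed buffers (names and strides are hypothetical):

    #include <cstdint>
    #include "libyuv/convert_from_argb.h"

    void AbgrToNV12(const uint8_t* abgr, int width, int height,
                    uint8_t* dst_y, uint8_t* dst_uv) {
      libyuv::ABGRToNV12(abgr, width * 4, dst_y, width,
                         dst_uv, ((width + 1) / 2) * 2, width, height);
    }
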
diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
index 0229cb5e736..3e27cc107dc 100644
--- a/chromium/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
@@ -48,6 +48,7 @@ static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x200000;
static const int kCpuHasMSA = 0x400000;
+static const int kCpuHasMMI = 0x800000;
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
@@ -70,6 +71,8 @@ static __inline int TestCpuFlag(int test_flag) {
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
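
A sketch of that testing flow with the new MMI flag, following the header's own MaskCpuFlags example:

    #include "libyuv/cpu_id.h"

    void DisableMmiForTests() {
      if (libyuv::TestCpuFlag(libyuv::kCpuHasMMI)) {
        // Per the comment above: mask the flag so later calls take
        // the remaining (C or other SIMD) paths.
        libyuv::MaskCpuFlags(~libyuv::kCpuHasMMI);
      }
    }
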
diff --git a/chromium/third_party/libyuv/include/libyuv/macros_msa.h b/chromium/third_party/libyuv/include/libyuv/macros_msa.h
index 29997ce11fd..4e232b66bfe 100644
--- a/chromium/third_party/libyuv/include/libyuv/macros_msa.h
+++ b/chromium/third_party/libyuv/include/libyuv/macros_msa.h
@@ -140,6 +140,9 @@
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)
+
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
index 91137baba25..8d868b95425 100644
--- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
@@ -105,6 +105,28 @@ void MergeUVPlane(const uint8_t* src_u,
int width,
int height);
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Split interleaved RGB plane into separate R, G and B planes.
LIBYUV_API
void SplitRGBPlane(const uint8_t* src_rgb,
@@ -178,6 +200,16 @@ int I444Copy(const uint8_t* src_y,
int width,
int height);
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+ int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_uv, int dst_stride_uv, int width, int height);
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+ int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_vu, int dst_stride_vu, int width, int height);
+
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -224,6 +256,19 @@ int UYVYToNV12(const uint8_t* src_uyvy,
int width,
int height);
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
LIBYUV_API
int YUY2ToY(const uint8_t* src_yuy2,
int src_stride_yuy2,
@@ -280,6 +325,22 @@ int I400Mirror(const uint8_t* src_y,
int height);
// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
@@ -291,56 +352,35 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
+// RGB24 mirror.
LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
-// Convert I422 to ABGR.
+// Mirror a plane of data.
LIBYUV_API
-int I422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
-// Convert I422 to RGBA.
+// Mirror a plane of UV data.
LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
// Alias
#define RGB24ToRAW RAWToRGB24
@@ -721,6 +761,19 @@ int ARGBBlur(const uint8_t* src_argb,
int height,
int radius);
+// Gaussian 5x5 blur of a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height);
+
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8_t* src_argb,
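
For example, the new NV21ToNV12 declared above swaps the chroma order in one call; a sketch with tightly packed buffers (hypothetical names):

    #include <cstdint>
    #include "libyuv/planar_functions.h"

    void Nv21ToNv12Packed(const uint8_t* src_y, const uint8_t* src_vu,
                          int width, int height,
                          uint8_t* dst_y, uint8_t* dst_uv) {
      int uv_stride = ((width + 1) / 2) * 2;
      libyuv::NV21ToNV12(src_y, width, src_vu, uv_stride,
                         dst_y, width, dst_uv, uv_stride, width, height);
    }
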
diff --git a/chromium/third_party/libyuv/include/libyuv/rotate.h b/chromium/third_party/libyuv/include/libyuv/rotate.h
index 76b692be8b0..308882242cb 100644
--- a/chromium/third_party/libyuv/include/libyuv/rotate.h
+++ b/chromium/third_party/libyuv/include/libyuv/rotate.h
@@ -49,6 +49,24 @@ int I420Rotate(const uint8_t* src_y,
int height,
enum RotationMode mode);
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
// Rotate NV12 input and store in I420.
LIBYUV_API
int NV12ToI420Rotate(const uint8_t* src_y,
@@ -100,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
int width,
int height);
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
LIBYUV_API
void RotateUV90(const uint8_t* src,
int src_stride,
@@ -110,10 +132,6 @@ void RotateUV90(const uint8_t* src,
int width,
int height);
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8_t* src,
int src_stride,
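
A sketch of the new I444Rotate: a 90-degree rotation swaps width and height, so the destination strides below use the source height (kRotate90 is the existing RotationMode value; buffer names are hypothetical):

    #include <cstdint>
    #include "libyuv/rotate.h"

    void RotateI444By90(const uint8_t* y, const uint8_t* u,
                        const uint8_t* v, int width, int height,
                        uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v) {
      libyuv::I444Rotate(y, width, u, width, v, width,
                         dst_y, height, dst_u, height, dst_v, height,
                         width, height, libyuv::kRotate90);
    }
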
diff --git a/chromium/third_party/libyuv/include/libyuv/rotate_row.h b/chromium/third_party/libyuv/include/libyuv/rotate_row.h
index 5edc0fcf13a..022293eef2c 100644
--- a/chromium/third_party/libyuv/include/libyuv/rotate_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/rotate_row.h
@@ -60,6 +60,11 @@ extern "C" {
#define HAS_TRANSPOSEUVWX16_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_TRANSPOSEWX8_MMI
+#define HAS_TRANSPOSEUVWX8_MMI
+#endif
+
void TransposeWxH_C(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -87,6 +92,11 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -108,6 +118,11 @@ void TransposeWx8_Any_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -156,6 +171,13 @@ void TransposeUVWx8_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
void TransposeUVWx16_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
@@ -178,6 +200,13 @@ void TransposeUVWx8_Any_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
void TransposeUVWx16_Any_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h
index 1468f4b9925..a27788c1f69 100644
--- a/chromium/third_party/libyuv/include/libyuv/row.h
+++ b/chromium/third_party/libyuv/include/libyuv/row.h
@@ -98,7 +98,6 @@ extern "C" {
#define HAS_COPYROW_SSE2
#define HAS_H422TOARGBROW_SSSE3
#define HAS_HALFFLOATROW_SSE2
-#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
@@ -112,7 +111,7 @@ extern "C" {
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORUVROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
@@ -123,6 +122,8 @@ extern "C" {
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB24TOYJROW_SSSE3
+#define HAS_RAWTOYJROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
@@ -194,11 +195,12 @@ extern "C" {
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
+#define HAS_RGB24TOYJROW_AVX2
+#define HAS_RAWTOYJROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
#define HAS_HALFFLOATROW_AVX2
// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
-#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TOARGBROW_AVX2
@@ -269,12 +271,19 @@ extern "C" {
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
-// I210 is for H010. 2 = 422. I for 601 vs H for 709.
+#define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
+#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
+#define HAS_SWAPUVROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
@@ -283,18 +292,26 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ABGRTOUVROW_AVX2
+#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2
+#define HAS_RGBATOYJROW_AVX2
+#define HAS_SWAPUVROW_AVX2
+// TODO(fbarchard): Fix AVX2 version of YUV24
+// #define HAS_NV21TOYUV24ROW_AVX2
#endif
// The following are available for AVX512 clang x86 platforms:
@@ -330,11 +347,15 @@ extern "C" {
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOUVROW_NEON
+#define HAS_AYUVTOVUROW_NEON
+#define HAS_AYUVTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
@@ -350,26 +371,33 @@ extern "C" {
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
+#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
#define HAS_NV12TORGB565ROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_NV21TORGB24ROW_NEON
+#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
+#define HAS_SWAPUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
@@ -386,6 +414,7 @@ extern "C" {
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24MIRRORROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSEPIAROW_NEON
@@ -403,6 +432,9 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_GAUSSCOL_F32_NEON
+
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_ABGRTOUVROW_MSA
@@ -447,11 +479,14 @@ extern "C" {
#define HAS_I422TOUYVYROW_MSA
#define HAS_I422TOYUY2ROW_MSA
#define HAS_I444TOARGBROW_MSA
+#define HAS_I422TOARGB1555ROW_MSA
+#define HAS_I422TORGB565ROW_MSA
#define HAS_INTERPOLATEROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_MERGEUVROW_MSA
#define HAS_MIRRORROW_MSA
#define HAS_MIRRORUVROW_MSA
+#define HAS_MIRRORSPLITUVROW_MSA
#define HAS_NV12TOARGBROW_MSA
#define HAS_NV12TORGB565ROW_MSA
#define HAS_NV21TOARGBROW_MSA
@@ -483,6 +518,98 @@ extern "C" {
#define HAS_YUY2TOYROW_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_ABGRTOUVROW_MMI
+#define HAS_ABGRTOYROW_MMI
+#define HAS_ARGB1555TOARGBROW_MMI
+#define HAS_ARGB1555TOUVROW_MMI
+#define HAS_ARGB1555TOYROW_MMI
+#define HAS_ARGB4444TOARGBROW_MMI
+#define HAS_ARGB4444TOUVROW_MMI
+#define HAS_ARGB4444TOYROW_MMI
+#define HAS_ARGBADDROW_MMI
+#define HAS_ARGBATTENUATEROW_MMI
+#define HAS_ARGBBLENDROW_MMI
+#define HAS_ARGBCOLORMATRIXROW_MMI
+#define HAS_ARGBCOPYALPHAROW_MMI
+#define HAS_ARGBCOPYYTOALPHAROW_MMI
+#define HAS_ARGBEXTRACTALPHAROW_MMI
+#define HAS_ARGBGRAYROW_MMI
+#define HAS_ARGBMIRRORROW_MMI
+#define HAS_ARGBMULTIPLYROW_MMI
+#define HAS_ARGBSEPIAROW_MMI
+#define HAS_ARGBSETROW_MMI
+#define HAS_ARGBSHADEROW_MMI
+#define HAS_ARGBSHUFFLEROW_MMI
+#define HAS_ARGBSUBTRACTROW_MMI
+#define HAS_ARGBTOARGB1555ROW_MMI
+#define HAS_ARGBTOARGB4444ROW_MMI
+#define HAS_ARGBTORAWROW_MMI
+#define HAS_ARGBTORGB24ROW_MMI
+#define HAS_ARGBTORGB565DITHERROW_MMI
+#define HAS_ARGBTORGB565ROW_MMI
+#define HAS_ARGBTOUV444ROW_MMI
+#define HAS_ARGBTOUVJROW_MMI
+#define HAS_ARGBTOUVROW_MMI
+#define HAS_ARGBTOYJROW_MMI
+#define HAS_ARGBTOYROW_MMI
+#define HAS_BGRATOUVROW_MMI
+#define HAS_BGRATOYROW_MMI
+#define HAS_BLENDPLANEROW_MMI
+#define HAS_COMPUTECUMULATIVESUMROW_MMI
+#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI
+#define HAS_HALFFLOATROW_MMI
+#define HAS_I400TOARGBROW_MMI
+#define HAS_I422TOUYVYROW_MMI
+#define HAS_I422TOYUY2ROW_MMI
+#define HAS_I422TOARGBROW_MMI
+#define HAS_I444TOARGBROW_MMI
+#define HAS_INTERPOLATEROW_MMI
+#define HAS_J400TOARGBROW_MMI
+#define HAS_MERGERGBROW_MMI
+#define HAS_MERGEUVROW_MMI
+#define HAS_MIRRORROW_MMI
+#define HAS_MIRRORSPLITUVROW_MMI
+#define HAS_RAWTOARGBROW_MMI
+#define HAS_RAWTORGB24ROW_MMI
+#define HAS_RAWTOUVROW_MMI
+#define HAS_RAWTOYROW_MMI
+#define HAS_RGB24TOARGBROW_MMI
+#define HAS_RGB24TOUVROW_MMI
+#define HAS_RGB24TOYROW_MMI
+#define HAS_RGB565TOARGBROW_MMI
+#define HAS_RGB565TOUVROW_MMI
+#define HAS_RGB565TOYROW_MMI
+#define HAS_RGBATOUVROW_MMI
+#define HAS_RGBATOYROW_MMI
+#define HAS_SOBELROW_MMI
+#define HAS_SOBELTOPLANEROW_MMI
+#define HAS_SOBELXROW_MMI
+#define HAS_SOBELXYROW_MMI
+#define HAS_SOBELYROW_MMI
+#define HAS_SPLITRGBROW_MMI
+#define HAS_SPLITUVROW_MMI
+#define HAS_UYVYTOUVROW_MMI
+#define HAS_UYVYTOYROW_MMI
+#define HAS_YUY2TOUV422ROW_MMI
+#define HAS_YUY2TOUVROW_MMI
+#define HAS_YUY2TOYROW_MMI
+#define HAS_I210TOARGBROW_MMI
+#define HAS_I422TOARGB4444ROW_MMI
+#define HAS_I422TOARGB1555ROW_MMI
+#define HAS_I422TORGB565ROW_MMI
+#define HAS_NV21TORGB24ROW_MMI
+#define HAS_NV12TORGB24ROW_MMI
+#define HAS_I422ALPHATOARGBROW_MMI
+#define HAS_I422TORGB24ROW_MMI
+#define HAS_NV12TOARGBROW_MMI
+#define HAS_NV21TOARGBROW_MMI
+#define HAS_NV12TORGB565ROW_MMI
+#define HAS_YUY2TOARGBROW_MMI
+#define HAS_UYVYTOARGBROW_MMI
+#define HAS_I422TORGBAROW_MMI
+#endif
+
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
#if defined(VISUALC_HAS_AVX2)
#define SIMD_ALIGNED(var) __declspec(align(32)) var
@@ -491,6 +618,7 @@ extern "C" {
#endif
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
typedef __declspec(align(16)) int8_t vec8[16];
typedef __declspec(align(16)) uint16_t uvec16[8];
typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -510,6 +638,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#endif
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
typedef int8_t __attribute__((vector_size(16))) vec8;
typedef uint16_t __attribute__((vector_size(16))) uvec16;
typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -524,6 +653,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#define SIMD_ALIGNED(var) var
typedef int16_t vec16[8];
typedef int32_t vec32[4];
+typedef float vecf32[4];
typedef int8_t vec8[16];
typedef uint16_t uvec16[8];
typedef uint32_t uvec32[4];
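
The new vecf32 typedef gives the float Gauss kernels a four-float, 16-byte vector type under all three compiler branches (MSVC declspec alignment, GCC/clang vector extensions, and the plain scalar fallback). A sketch of the intended use; the constant's name is illustrative, not from the library:

    #include "libyuv/row.h"

    // A four-float coefficient block for the float Gauss kernels. Under
    // MSVC and GCC/clang the typedef itself carries 16-byte alignment;
    // the last branch is the scalar fallback, where alignment is moot.
    static const vecf32 kGaussCoefF32 = {1.0f, 4.0f, 6.0f, 4.0f};
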
@@ -564,6 +694,7 @@ struct YuvConstants {
int16_t kUVBiasG[16];
int16_t kUVBiasR[16];
int16_t kYToRgb[16];
+ int16_t kYBiasToRgb[16];
};
// Offsets into YuvConstants structure
@@ -574,17 +705,9 @@ struct YuvConstants {
#define KUVBIASG 128
#define KUVBIASR 160
#define KYTORGB 192
-#endif
-
-// Conversion matrix for YUV to RGB
-extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709
+#define KYBIASTORGB 224
-// Conversion matrix for YVU to BGR
-extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
+#endif
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
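
Every YuvConstants member is an array of 16 int16_t, i.e. 32 bytes, so the byte offsets consumed by the assembly step by 32 and the new bias row lands at 192 + 32 = 224, which is what KYBIASTORGB records. A compile-time sketch of that arithmetic for the struct layout shown above (valid on the branch where these macros are defined), plus a worked example of IS_ALIGNED:

    #include <stddef.h>
    #include "libyuv/row.h"

    // Each constants row is 16 * sizeof(int16_t) = 32 bytes.
    _Static_assert(offsetof(struct YuvConstants, kYToRgb) == 192,
                   "KYTORGB matches the struct layout");
    _Static_assert(offsetof(struct YuvConstants, kYBiasToRgb) == 224,
                   "KYBIASTORGB = 192 + 32");

    // IS_ALIGNED assumes a power-of-two alignment: a - 1 is a low-bit
    // mask, so the test checks that the low log2(a) address bits are
    // zero. For example, 0x1008 & (16 - 1) == 8, so an address of 0x1008
    // is not 16-byte aligned.
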
@@ -740,6 +863,10 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
@@ -754,6 +881,12 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
@@ -767,6 +900,12 @@ void I422ToRGBARow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -824,19 +963,31 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -850,7 +1001,16 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_MMI(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -900,32 +1060,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -940,11 +1100,58 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
@@ -952,37 +1159,59 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width);
-void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
-void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+
+void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1001,38 +1230,57 @@ void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
-void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1042,6 +1290,11 @@ void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
@@ -1090,6 +1343,15 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
@@ -1175,47 +1437,92 @@ void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_C(const uint8_t* src_rgb0,
+void BGRAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_C(const uint8_t* src_rgb0,
+void ABGRToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_C(const uint8_t* src_rgb0,
+void RGBAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_C(const uint8_t* src_rgb0,
+void RGB24ToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_C(const uint8_t* src_rgb0,
+void RAWToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1254,34 +1561,50 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
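
This hunk re-points the MirrorUVRow name at a plain interleaved mirror (src_uv to dst_uv) and renames the old mirror-and-split behaviour to MirrorSplitUVRow. Reference loops for the two semantics as the signatures imply (a sketch, not the library's code; width counts UV sample pairs):

    #include <stdint.h>

    // New MirrorUVRow meaning: reverse pair order, keep U before V.
    static void MirrorUVRow_ref(const uint8_t* src_uv, uint8_t* dst_uv,
                                int width) {
      for (int x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = src_uv[2 * (width - 1 - x) + 0];
        dst_uv[2 * x + 1] = src_uv[2 * (width - 1 - x) + 1];
      }
    }

    // Old behaviour, now MirrorSplitUVRow: reverse and de-interleave.
    static void MirrorSplitUVRow_ref(const uint8_t* src_uv, uint8_t* dst_u,
                                     uint8_t* dst_v, int width) {
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[2 * (width - 1 - x) + 0];
        dst_v[x] = src_uv[2 * (width - 1 - x) + 1];
      }
    }
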
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1293,6 +1616,17 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
@@ -1314,6 +1648,10 @@ void SplitUVRow_MSA(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void SplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1330,6 +1668,10 @@ void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void SplitUVRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void MergeUVRow_C(const uint8_t* src_u,
const uint8_t* src_v,
@@ -1351,6 +1693,10 @@ void MergeUVRow_MSA(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
+void MergeUVRow_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -1367,6 +1713,38 @@ void MergeUVRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void MergeUVRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
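
HalfMergeUVRow reads a row pair from separate U and V planes and emits one interleaved row; the name and signature suggest a rounded 2x2 average that halves chroma resolution in both directions, as an I444-to-NV12 step would need. A reference sketch under that inferred semantics, assuming an even source width:

    #include <stdint.h>

    // Plausible reference semantics (inferred, not copied from the
    // library): average each 2x2 block of U and of V across two source
    // rows and emit one interleaved UV pair. width is the source width
    // in samples.
    static void HalfMergeUVRow_ref(const uint8_t* src_u, int src_stride_u,
                                   const uint8_t* src_v, int src_stride_v,
                                   uint8_t* dst_uv, int width) {
      for (int x = 0; x < width; x += 2) {
        dst_uv[0] = (src_u[x] + src_u[x + 1] + src_u[src_stride_u + x] +
                     src_u[src_stride_u + x + 1] + 2) >> 2;  // rounded avg
        dst_uv[1] = (src_v[x] + src_v[x + 1] + src_v[src_stride_v + x] +
                     src_v[src_stride_v + x + 1] + 2) >> 2;
        dst_uv += 2;
      }
    }
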
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
@@ -1383,6 +1761,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -1393,6 +1776,11 @@ void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
+void SplitRGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
void MergeRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
@@ -1409,6 +1797,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
+void MergeRGBRow_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -1419,6 +1812,11 @@ void MergeRGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
+void MergeRGBRow_Any_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
@@ -1497,12 +1895,16 @@ void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
@@ -1517,6 +1919,9 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
uint8_t* dst_a,
int width);
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1529,16 +1934,23 @@ void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void SetRow_C(uint8_t* dst, uint8_t v8, int width);
void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
@@ -1554,6 +1966,8 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MMI(uint8_t* dst_ptr, uint32_t v32, int width);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8_t* src_argb,
@@ -1576,6 +1990,10 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width);
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
@@ -1592,11 +2010,16 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
int width);
+void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -1615,30 +2038,44 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
@@ -1658,6 +2095,9 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1687,24 +2127,36 @@ void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1712,6 +2164,9 @@ void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
@@ -1780,6 +2235,20 @@ void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
const uint32_t dither4,
int width);
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+
void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -1793,6 +2262,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1804,6 +2274,7 @@ void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void I444ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
@@ -1867,6 +2338,10 @@ void NV21ToRGB24Row_C(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
@@ -2033,6 +2508,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb565,
@@ -2238,6 +2717,10 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -2319,21 +2802,50 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MMI(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
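
The I400 rows now thread a YuvConstants pointer through, so grayscale expansion shares the matrix machinery (including the new kYBiasToRgb row) rather than a hard-coded ramp. A sketch of the adjusted calling convention, assuming the BT.601 table kYuvI601Constants is in scope (its extern declaration is relocated by this change):

    #include <stdint.h>
    #include "libyuv/row.h"

    // Expand one row of 8-bit luma to ARGB with the BT.601 matrix.
    static void gray_row_to_argb(const uint8_t* src_y, uint8_t* dst_argb,
                                 int width) {
      I400ToARGBRow_C(src_y, dst_argb, &kYuvI601Constants, width);
    }
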
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
@@ -2348,6 +2860,10 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
void ARGBBlendRow_C(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
@@ -2374,6 +2890,16 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void BlendPlaneRow_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
void BlendPlaneRow_C(const uint8_t* src0,
const uint8_t* src1,
const uint8_t* alpha,
@@ -2418,6 +2944,14 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB add images.
void ARGBAddRow_C(const uint8_t* src_argb0,
@@ -2456,6 +2990,14 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
@@ -2495,6 +3037,14 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -2584,6 +3134,24 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
const uint32_t param,
int width);
+void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+
void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2653,6 +3221,10 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -2672,12 +3244,24 @@ void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2770,15 +3354,25 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width);
void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_C(const uint8_t* src_yuy2,
int src_stride_yuy2,
@@ -2820,15 +3414,25 @@ void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
@@ -2870,15 +3474,25 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width);
void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_C(const uint8_t* src_uyvy,
@@ -2921,15 +3535,59 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
void I422ToYUY2Row_C(const uint8_t* src_y,
const uint8_t* src_u,
@@ -3006,21 +3664,41 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
void I422ToUYVYRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
// Effects related row functions.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
@@ -3036,6 +3714,9 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -3048,6 +3729,9 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
// Inverse table for unattenuate, shared by C and SSE2.
extern const uint32_t fixed_invtbl8[256];
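Unattenuation divides each premultiplied channel by alpha; the shared table turns that division into a multiply. A minimal sketch of the idea, assuming entries of roughly (255 << 16) / a in 16.16 fixed point (the real table's exact contents and rounding bias are not shown in this diff):

#include <stdint.h>

// Sketch only: recover a straight (non-premultiplied) channel from a
// premultiplied one via a reciprocal-of-alpha table. The assumed entry
// is t[a] ~= (255 << 16) / a; fixed_invtbl8's actual bias may differ.
static inline uint8_t UnattenuateSketch(uint8_t ch, uint8_t a,
                                        const uint32_t* t) {
  if (a == 0) return 0;                      // fully transparent pixel
  uint32_t v = ((uint32_t)ch * t[a]) >> 16;  // ch * 255 / a
  return (uint8_t)(v > 255 ? 255 : v);       // clamp if ch > a
}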
@@ -3071,11 +3755,13 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3093,6 +3779,10 @@ void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
void ARGBColorTableRow_C(uint8_t* dst_argb,
const uint8_t* table_argb,
@@ -3145,6 +3835,10 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value);
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
// Used for blur.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
@@ -3158,6 +3852,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width);
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width);
+
void CumulativeSumToAverageRow_C(const int32_t* tl,
const int32_t* bl,
int w,
@@ -3208,6 +3907,11 @@ void InterpolateRow_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
@@ -3228,6 +3932,11 @@ void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
+void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
@@ -3256,6 +3965,11 @@ void SobelXRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y2,
uint8_t* dst_sobelx,
int width);
+void SobelXRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
void SobelYRow_C(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
@@ -3272,6 +3986,10 @@ void SobelYRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
int width);
+void SobelYRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
void SobelRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
@@ -3288,6 +4006,10 @@ void SobelRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
+void SobelRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
void SobelToPlaneRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
@@ -3304,6 +4026,10 @@ void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
int width);
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
void SobelXYRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
@@ -3320,6 +4046,10 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
void SobelRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3332,6 +4062,10 @@ void SobelRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void SobelRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3344,6 +4078,10 @@ void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3356,6 +4094,10 @@ void SobelXYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void SobelXYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void ARGBPolynomialRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3462,6 +4204,178 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+void I210ToARGBRow_MMI(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_MMI(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
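These helpers suggest a separable 5-tap Gaussian: one row pass plus one column pass over five source rows. A plain-C sketch of the column pass, assuming the classic 1-4-6-4-1 kernel with normalization deferred to the row pass (both assumptions, since the implementations are not in this hunk):

// Sketch: 5-tap Gaussian column pass; the kernel weights are assumed.
static void GaussColF32Sketch(const float* src0, const float* src1,
                              const float* src2, const float* src3,
                              const float* src4, float* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src0[i] + 4.0f * src1[i] + 6.0f * src2[i] + 4.0f * src3[i] +
             src4[i];
  }
}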
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libyuv/include/libyuv/scale.h
index b937d348cab..add5a9eb622 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale.h
@@ -97,6 +97,79 @@ int I420Scale_16(const uint16_t* src_y,
int dst_height,
enum FilterMode filtering);
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
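A minimal call sketch for the new I444 path, halving a frame with bilinear filtering; the buffers and tight strides are illustrative assumptions:

#include <stdint.h>
#include "libyuv/scale.h"

// Sketch: all three I444 planes are full resolution, so each stride is
// simply the plane width when the buffers are tightly packed.
int HalveI444(const uint8_t* sy, const uint8_t* su, const uint8_t* sv,
              int w, int h, uint8_t* dy, uint8_t* du, uint8_t* dv) {
  const int dw = w / 2, dh = h / 2;
  return libyuv::I444Scale(sy, w, su, w, sv, w, w, h,
                           dy, dw, du, dw, dv, dw, dw, dh,
                           libyuv::kFilterBilinear);
}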
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
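And a matching sketch for the biplanar path; per the comment above, box filtering falls back to bilinear on the interleaved UV plane. Strides below assume tightly packed buffers and an even width:

#include <stdint.h>
#include "libyuv/scale.h"

// Sketch: the NV12 UV plane is half width but stores 2 bytes per chroma
// sample, so for even widths its byte stride equals the Y stride.
int HalveNV12(const uint8_t* sy, const uint8_t* suv, int w, int h,
              uint8_t* dy, uint8_t* duv) {
  const int dw = w / 2, dh = h / 2;
  return libyuv::NV12Scale(sy, w, suv, w, w, h,
                           dy, dw, duv, dw, dw, dh,
                           libyuv::kFilterBilinear);
}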
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h
index 7194ba09f84..a386d499895 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h
@@ -58,6 +58,7 @@ extern "C" {
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
+#define HAS_SCALEADDROW_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -69,7 +70,22 @@ extern "C" {
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
-#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -86,7 +102,9 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEADDROW_NEON
#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
@@ -94,7 +112,8 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
-#define HAS_SCALEARGBFILTERCOLS_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -110,6 +129,24 @@ extern "C" {
#define HAS_SCALEROWDOWN4_MSA
#endif
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_FIXEDDIV1_MIPS
+#define HAS_FIXEDDIV_MIPS
+#define HAS_SCALEADDROW_16_MMI
+#define HAS_SCALEADDROW_MMI
+#define HAS_SCALEARGBCOLS_MMI
+#define HAS_SCALEARGBCOLSUP2_MMI
+#define HAS_SCALEARGBROWDOWN2_MMI
+#define HAS_SCALEARGBROWDOWNEVEN_MMI
+#define HAS_SCALECOLS_16_MMI
+#define HAS_SCALECOLS_MMI
+#define HAS_SCALEROWDOWN2_16_MMI
+#define HAS_SCALEROWDOWN2_MMI
+#define HAS_SCALEROWDOWN4_16_MMI
+#define HAS_SCALEROWDOWN4_MMI
+#define HAS_SCALEROWDOWN34_MMI
+#endif
+
// Scale ARGB vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -147,12 +184,17 @@ enum FilterMode ScaleFilterReduce(int src_width,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_C(int num, int div);
int FixedDiv_X86(int num, int div);
+int FixedDiv_MIPS(int num, int div);
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div);
int FixedDiv1_X86(int num, int div);
+int FixedDiv1_MIPS(int num, int div);
#ifdef HAS_FIXEDDIV_X86
#define FixedDiv FixedDiv_X86
#define FixedDiv1 FixedDiv1_X86
+#elif defined HAS_FIXEDDIV_MIPS
+#define FixedDiv FixedDiv_MIPS
+#define FixedDiv1 FixedDiv1_MIPS
#else
#define FixedDiv FixedDiv_C
#define FixedDiv1 FixedDiv1_C
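The contract is 16.16 fixed point: the quotient keeps 16 fractional bits, so dividing 1 by 2 yields 0x8000. A hedged sketch of the portable behavior (the shipped FixedDiv1_C may apply a different rounding bias than a naive reading of its comment suggests):

#include <stdint.h>

// Sketch of the assumed C reference: (num / div) in 16.16 fixed point.
static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
// e.g. FixedDivSketch(3, 4) == 0xC000, i.e. 0.75 in 16.16.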
@@ -352,6 +394,53 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
int dst_width,
int x32,
int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int,
+ int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
@@ -569,6 +658,16 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
int dst_width,
int x,
int dx);
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
@@ -607,6 +706,18 @@ void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -643,7 +754,18 @@ void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-
+void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
@@ -674,6 +796,16 @@ void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width);
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
@@ -704,6 +836,202 @@ void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
+void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
@@ -874,6 +1202,10 @@ void ScaleRowDown34_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
+void ScaleRowDown34_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* d,
@@ -927,6 +1259,10 @@ void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowDown34_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -936,6 +1272,93 @@ void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width);
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_uv.h b/chromium/third_party/libyuv/include/libyuv/scale_uv.h
new file mode 100644
index 00000000000..1b6327aaed1
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/scale_uv.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_UV_H_
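UVScale works on a single interleaved two-byte-per-sample plane, which is how the NV12 path above scales chroma. A minimal sketch, with width/height counting UV sample pairs and byte strides assumed tightly packed:

#include <stdint.h>
#include "libyuv/scale_uv.h"

// Sketch: halve an interleaved UV plane; each sample pair is 2 bytes,
// hence the (width * 2) byte strides.
int HalveUVPlane(const uint8_t* src_uv, int w, int h, uint8_t* dst_uv) {
  return libyuv::UVScale(src_uv, w * 2, w, h,
                         dst_uv, (w / 2) * 2, w / 2, h / 2,
                         libyuv::kFilterBilinear);
}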
diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h
index 249f61f71ac..efaac73e3ab 100644
--- a/chromium/third_party/libyuv/include/libyuv/version.h
+++ b/chromium/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1714
+#define LIBYUV_VERSION 1768
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/chromium/third_party/libyuv/include/libyuv/video_common.h b/chromium/third_party/libyuv/include/libyuv/video_common.h
index ffcbdbf1b0c..b9823d71d09 100644
--- a/chromium/third_party/libyuv/include/libyuv/video_common.h
+++ b/chromium/third_party/libyuv/include/libyuv/video_common.h
@@ -50,7 +50,7 @@ extern "C" {
// Secondary formats are converted in 2 steps.
// Auxiliary formats call primary converters.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -59,9 +59,10 @@ enum FourCC {
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420
+ FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422
- // 1 Secondary YUV format: row biplanar.
+ // 1 Secondary YUV format: row biplanar. Deprecated.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
// 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
@@ -80,15 +81,29 @@ enum FourCC {
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ // 14 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
- FOURCC_J420 = FOURCC('J', '4', '2', '0'),
- FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
- FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
- FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_J420 =
+ FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J422 =
+ FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J444 =
+ FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J400 =
+ FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc
+ FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc
+ FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
+ FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420
+ FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422
+ FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -133,7 +148,7 @@ enum FourCCBpp {
FOURCC_BPP_NV12 = 12,
FOURCC_BPP_YUY2 = 16,
FOURCC_BPP_UYVY = 16,
- FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_M420 = 12, // deprecated
FOURCC_BPP_Q420 = 12,
FOURCC_BPP_ARGB = 32,
FOURCC_BPP_BGRA = 32,
diff --git a/chromium/third_party/libyuv/libyuv.gni b/chromium/third_party/libyuv/libyuv.gni
index 89e4d382327..8df40ba2d77 100644
--- a/chromium/third_party/libyuv/libyuv.gni
+++ b/chromium/third_party/libyuv/libyuv.gni
@@ -13,8 +13,11 @@ import("//build/config/mips.gni")
declare_args() {
libyuv_include_tests = !build_with_chromium
libyuv_disable_jpeg = false
- libyuv_use_neon = (current_cpu == "arm64" ||
- (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)))
- libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") &&
- mips_use_msa
+ libyuv_use_neon =
+ current_cpu == "arm64" ||
+ (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
+ libyuv_use_msa =
+ (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa
+ libyuv_use_mmi =
+ (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi
}
diff --git a/chromium/third_party/libyuv/linux.mk b/chromium/third_party/libyuv/linux.mk
index 3cb6addddd4..3e93b710d49 100644
--- a/chromium/third_party/libyuv/linux.mk
+++ b/chromium/third_party/libyuv/linux.mk
@@ -13,14 +13,15 @@ LOCAL_OBJ_FILES := \
source/compare.o \
source/compare_common.o \
source/compare_gcc.o \
+ source/compare_mmi.o \
source/compare_msa.o \
- source/compare_neon64.o \
source/compare_neon.o \
+ source/compare_neon64.o \
source/compare_win.o \
- source/convert_argb.o \
source/convert.o \
- source/convert_from_argb.o \
+ source/convert_argb.o \
source/convert_from.o \
+ source/convert_from_argb.o \
source/convert_jpeg.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
@@ -28,30 +29,34 @@ LOCAL_OBJ_FILES := \
source/mjpeg_decoder.o \
source/mjpeg_validate.o \
source/planar_functions.o \
+ source/rotate.o \
source/rotate_any.o \
source/rotate_argb.o \
- source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
+ source/rotate_mmi.o \
source/rotate_msa.o \
- source/rotate_neon64.o \
source/rotate_neon.o \
+ source/rotate_neon64.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
+ source/row_mmi.o \
source/row_msa.o \
- source/row_neon64.o \
source/row_neon.o \
+ source/row_neon64.o \
source/row_win.o \
+ source/scale.o \
source/scale_any.o \
source/scale_argb.o \
- source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
+ source/scale_mmi.o \
source/scale_msa.o \
- source/scale_neon64.o \
source/scale_neon.o \
+ source/scale_neon64.o \
+ source/scale_uv.o \
source/scale_win.o \
source/video_common.o
@@ -61,7 +66,7 @@ LOCAL_OBJ_FILES := \
.c.o:
$(CC) -c $(CFLAGS) $*.c -o $*.o
-all: libyuv.a yuvconvert cpuid psnr
+all: libyuv.a i444tonv12_eg yuvconvert cpuid psnr
libyuv.a: $(LOCAL_OBJ_FILES)
$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
@@ -74,6 +79,10 @@ yuvconvert: util/yuvconvert.cc libyuv.a
psnr: util/psnr.cc
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
+# A simple conversion example.
+i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a
+ $(CC) $(CFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a
+
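The new i444tonv12_eg target builds a small demo whose source is not part of this diff; in spirit the conversion it exercises is a single call (a sketch assuming the I444ToNV12 entry point in libyuv/convert.h):

#include <stdint.h>
#include "libyuv/convert.h"  // assumed home of I444ToNV12

// Sketch: I444 (three full-size planes) to NV12 (Y plus interleaved
// half-size UV), tightly packed buffers and even dimensions assumed.
int I444ToNV12Sketch(const uint8_t* sy, const uint8_t* su,
                     const uint8_t* sv, int w, int h,
                     uint8_t* dy, uint8_t* duv) {
  return libyuv::I444ToNV12(sy, w, su, w, sv, w, dy, w, duv, w, w, h);
}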
# A C test utility that uses libyuv conversion from C.
# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0
# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk
@@ -81,4 +90,4 @@ cpuid: util/cpuid.c libyuv.a
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
clean:
- /bin/rm -f source/*.o *.ii *.s libyuv.a yuvconvert cpuid psnr
+ /bin/rm -f source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert cpuid psnr
diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc
index 50e3abd0556..e93aba1b53e 100644
--- a/chromium/third_party/libyuv/source/compare.cc
+++ b/chromium/third_party/libyuv/source/compare.cc
@@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
- if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
- if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
@@ -149,11 +149,17 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a,
HammingDistance = HammingDistance_AVX2;
}
#endif
+#if defined(HAS_HAMMINGDISTANCE_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ HammingDistance = HammingDistance_MMI;
+ }
+#endif
#if defined(HAS_HAMMINGDISTANCE_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
HammingDistance = HammingDistance_MSA;
}
#endif
+
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
#endif
@@ -205,6 +211,11 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
SumSquareError = SumSquareError_AVX2;
}
#endif
+#if defined(HAS_SUMSQUAREERROR_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SumSquareError = SumSquareError_MMI;
+ }
+#endif
#if defined(HAS_SUMSQUAREERROR_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SumSquareError = SumSquareError_MSA;
diff --git a/chromium/third_party/libyuv/source/compare_gcc.cc b/chromium/third_party/libyuv/source/compare_gcc.cc
index 676527c1b1b..6700f9697e0 100644
--- a/chromium/third_party/libyuv/source/compare_gcc.cc
+++ b/chromium/third_party/libyuv/source/compare_gcc.cc
@@ -29,38 +29,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
uint64_t diff = 0u;
asm volatile(
- "xor %3,%3 \n"
- "xor %%r8,%%r8 \n"
- "xor %%r9,%%r9 \n"
- "xor %%r10,%%r10 \n"
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%rcx \n"
- "mov 0x8(%0),%%rdx \n"
- "xor (%1),%%rcx \n"
- "xor 0x8(%1),%%rdx \n"
- "popcnt %%rcx,%%rcx \n"
- "popcnt %%rdx,%%rdx \n"
- "mov 0x10(%0),%%rsi \n"
- "mov 0x18(%0),%%rdi \n"
- "xor 0x10(%1),%%rsi \n"
- "xor 0x18(%1),%%rdi \n"
- "popcnt %%rsi,%%rsi \n"
- "popcnt %%rdi,%%rdi \n"
- "add $0x20,%0 \n"
- "add $0x20,%1 \n"
- "add %%rcx,%3 \n"
- "add %%rdx,%%r8 \n"
- "add %%rsi,%%r9 \n"
- "add %%rdi,%%r10 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "add %%r8, %3 \n"
- "add %%r9, %3 \n"
- "add %%r10, %3 \n"
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -80,26 +80,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%ecx \n"
- "mov 0x4(%0),%%edx \n"
- "xor (%1),%%ecx \n"
- "xor 0x4(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "mov 0x8(%0),%%ecx \n"
- "mov 0xc(%0),%%edx \n"
- "xor 0x8(%1),%%ecx \n"
- "xor 0xc(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "add $0x10,%0 \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -121,46 +121,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t diff = 0u;
asm volatile(
- "movdqa %4,%%xmm2 \n"
- "movdqa %5,%%xmm3 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub %0,%1 \n"
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa 0x10(%0), %%xmm5 \n"
- "pxor (%0,%1), %%xmm4 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pand %%xmm2,%%xmm6 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm6,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "paddb %%xmm7,%%xmm6 \n"
- "pxor 0x10(%0,%1),%%xmm5 \n"
- "add $0x20,%0 \n"
- "movdqa %%xmm5,%%xmm4 \n"
- "pand %%xmm2,%%xmm5 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm5,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm5 \n"
- "pshufb %%xmm4,%%xmm5 \n"
- "paddb %%xmm7,%%xmm5 \n"
- "paddb %%xmm5,%%xmm6 \n"
- "psadbw %%xmm1,%%xmm6 \n"
- "paddd %%xmm6,%%xmm0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "pshufd $0xaa,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0, %3 \n"
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -182,40 +182,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
- "sub %0,%1 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqa (%0),%%ymm4 \n"
- "vmovdqa 0x20(%0), %%ymm5 \n"
- "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
- "vpand %%ymm2,%%ymm4,%%ymm6 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
- "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
- "add $0x40,%0 \n"
- "vpand %%ymm2,%%ymm4,%%ymm5 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
- "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
- "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
- "vpermq $0xb1,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xaa,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovd %%xmm0, %3 \n"
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -234,34 +234,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -301,44 +301,44 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
diff --git a/chromium/third_party/libyuv/source/compare_mmi.cc b/chromium/third_party/libyuv/source/compare_mmi.cc
new file mode 100644
index 00000000000..7640d9468cb
--- /dev/null
+++ b/chromium/third_party/libyuv/source/compare_mmi.cc
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// Hakmem method for Hamming distance.
+uint32_t HammingDistance_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0;
+ uint64_t c1 = 0x5555555555555555;
+ uint64_t c2 = 0x3333333333333333;
+ uint64_t c3 = 0x0f0f0f0f0f0f0f0f;
+ uint32_t c4 = 0x01010101;
+ uint64_t s1 = 1, s2 = 2, s3 = 4;
+ __asm__ volatile(
+ "1: \n\t"
+ "ldc1 %[ta], 0(%[src_a]) \n\t"
+ "ldc1 %[tb], 0(%[src_b]) \n\t"
+ "xor %[temp], %[ta], %[tb] \n\t"
+ "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1
+ "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1
+ "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1
+ "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2)
+ "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2
+ "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2
+ "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t
+ "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4
+ "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4)
+ "and %[temp1], %[temp1], %[c3] \n\t" //&c3
+ "dmfc1 $t0, %[temp1] \n\t"
+ "dsrl32 $t0, $t0, 0 \n\t "
+ "mul $t0, $t0, %[c4] \n\t"
+ "dsrl $t0, $t0, 24 \n\t"
+ "dadd %[diff], %[diff], $t0 \n\t"
+ "dmfc1 $t0, %[temp1] \n\t"
+ "mul $t0, $t0, %[c4] \n\t"
+ "dsrl $t0, $t0, 24 \n\t"
+ "dadd %[diff], %[diff], $t0 \n\t"
+ "daddiu %[src_a], %[src_a], 8 \n\t"
+ "daddiu %[src_b], %[src_b], 8 \n\t"
+ "addiu %[count], %[count], -8 \n\t"
+ "bgtz %[count], 1b \n\t"
+ "nop \n\t"
+ : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b),
+ [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp),
+ [temp1] "+f"(temp1)
+ : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1),
+ [s2] "f"(s2), [s3] "f"(s3)
+ : "memory");
+ return diff;
+}
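The constants above are the classic parallel bit count: pair sums (0x5555...), nibble sums (0x3333..., 0x0f0f...), then a multiply by 0x01010101 to accumulate the bytes within each 32-bit half. A scalar rendering of the same steps, as a readability sketch rather than the shipped C path:

#include <stdint.h>

// Sketch: Hakmem-style popcount of one 64-bit xor word, mirroring the
// c1/c2/c3/c4 steps in the MMI kernel above.
static uint32_t Popcount64Sketch(uint64_t x) {
  x = x - ((x >> 1) & 0x5555555555555555ULL);  // 2-bit partial sums
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;  // per-byte sums
  uint32_t hi = ((uint32_t)(x >> 32) * 0x01010101u) >> 24;         // high half
  uint32_t lo = ((uint32_t)(x & 0xffffffffu) * 0x01010101u) >> 24; // low half
  return hi + lo;
}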
+
+uint32_t SumSquareError_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse = 0u;
+ uint32_t sse_hi = 0u, sse_lo = 0u;
+
+ uint64_t src1, src2;
+ uint64_t diff, diff_hi, diff_lo;
+ uint64_t sse_sum, sse_tmp;
+
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t"
+
+ "1: \n\t"
+ "ldc1 %[src1], 0x00(%[src_a]) \n\t"
+ "ldc1 %[src2], 0x00(%[src_b]) \n\t"
+ "pasubub %[diff], %[src1], %[src2] \n\t"
+ "punpcklbh %[diff_lo], %[diff], %[mask] \n\t"
+ "punpckhbh %[diff_hi], %[diff], %[mask] \n\t"
+ "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t"
+ "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
+ "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t"
+ "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
+
+ "daddiu %[src_a], %[src_a], 0x08 \n\t"
+ "daddiu %[src_b], %[src_b], 0x08 \n\t"
+ "daddiu %[count], %[count], -0x08 \n\t"
+ "bnez %[count], 1b \n\t"
+
+ "mfc1 %[sse_lo], %[sse_sum] \n\t"
+ "mfhc1 %[sse_hi], %[sse_sum] \n\t"
+ "daddu %[sse], %[sse_hi], %[sse_lo] \n\t"
+ : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1),
+ [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi),
+ [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp),
+ [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo)
+ : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count),
+ [mask] "f"(mask)
+ : "memory");
+
+ return sse;
+}
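For reference, the scalar computation this kernel vectorizes eight bytes at a time:

#include <stdint.h>

// Sketch: sum of squared byte differences over count bytes.
static uint32_t SumSquareErrorSketch(const uint8_t* a, const uint8_t* b,
                                     int count) {
  uint32_t sse = 0;
  for (int i = 0; i < count; ++i) {
    const int d = (int)a[i] - (int)b[i];
    sse += (uint32_t)(d * d);
  }
  return sse;
}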
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc
index 2a2181e0cb3..afdd6012164 100644
--- a/chromium/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libyuv/source/compare_neon.cc
@@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t diff;
asm volatile(
- "vmov.u16 q4, #0 \n" // accumulator
+ "vmov.u16 q4, #0 \n" // accumulator
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n"
- "vld1.8 {q2, q3}, [%1]! \n"
- "veor.32 q0, q0, q2 \n"
- "veor.32 q1, q1, q3 \n"
- "vcnt.i8 q0, q0 \n"
- "vcnt.i8 q1, q1 \n"
- "subs %2, %2, #32 \n"
- "vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpadal.u8 q4, q0 \n" // 8 shorts
- "bgt 1b \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
- "vpaddl.u16 q0, q4 \n" // 4 ints
- "vpadd.u32 d0, d0, d1 \n"
- "vpadd.u32 d0, d0, d0 \n"
- "vmov.32 %3, d0[0] \n"
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
@@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
"1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
diff --git a/chromium/third_party/libyuv/source/compare_neon64.cc b/chromium/third_party/libyuv/source/compare_neon64.cc
index 6e8f672ab73..70fb9b9143f 100644
--- a/chromium/third_party/libyuv/source/compare_neon64.cc
+++ b/chromium/third_party/libyuv/source/compare_neon64.cc
@@ -27,22 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile(
- "movi v4.8h, #0 \n"
+ "movi v4.8h, #0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
- "eor v0.16b, v0.16b, v2.16b \n"
- "eor v1.16b, v1.16b, v3.16b \n"
- "cnt v0.16b, v0.16b \n"
- "cnt v1.16b, v1.16b \n"
- "subs %w2, %w2, #32 \n"
- "add v0.16b, v0.16b, v1.16b \n"
- "uadalp v4.8h, v0.16b \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
- "uaddlv s4, v4.8h \n"
- "fmov %w3, s4 \n"
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
@@ -54,28 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
diff --git a/chromium/third_party/libyuv/source/convert.cc b/chromium/third_party/libyuv/source/convert.cc
index 375cc732c1d..98258b9bc93 100644
--- a/chromium/third_party/libyuv/source/convert.cc
+++ b/chromium/third_party/libyuv/source/convert.cc
@@ -215,6 +215,195 @@ int I422ToI420(const uint8_t* src_y,
dst_v, dst_stride_v, width, height, src_uv_width, height);
}
+// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Allocate u and v buffers
+ align_buffer_64(plane_u, halfwidth * halfheight * 2);
+ uint8_t* plane_v = plane_u + halfwidth * halfheight;
+
+ I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+ height);
+ MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+ halfwidth, halfheight);
+ free_aligned_buffer_64(plane_u);
+ return 0;
+}
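+
+// Usage sketch (illustrative only; frame size and buffers are assumptions):
+#if 0
+static void ExampleI422ToNV21(void) {
+  enum { kW = 64, kH = 48 };  // even dimensions keep the strides simple
+  static uint8_t src_y[kW * kH], src_u[(kW / 2) * kH], src_v[(kW / 2) * kH];
+  static uint8_t dst_y[kW * kH], dst_vu[kW * (kH / 2)];  // interleaved V/U
+  I422ToNV21(src_y, kW, src_u, kW / 2, src_v, kW / 2, dst_y, kW, dst_vu, kW,
+             kW, kH);
+}
+#endif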
+
+#ifdef I422TONV21_ROW_VERSION
+// The unit test fails for this version.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// Swap src_u and src_v to implement I422ToNV12
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow = MergeUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
+ }
+ {
+ // Allocate 2 rows of vu.
+ int awidth = halfwidth * 2;
+ align_buffer_64(row_vu_0, awidth * 2);
+ uint8_t* row_vu_1 = row_vu_0 + awidth;
+
+ for (y = 0; y < height - 1; y += 2) {
+ MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
+ MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
+ halfwidth);
+ InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ MergeUVRow(src_v, src_u, dst_vu, halfwidth);
+ }
+ free_aligned_buffer_64(row_vu_0);
+ }
+ return 0;
+}
+#endif // I422TONV21_ROW_VERSION
+
// 444 chroma is 1x width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
@@ -237,6 +426,59 @@ int I444ToI420(const uint8_t* src_y,
dst_v, dst_stride_v, width, height, width, height);
}
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
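+
+// Usage sketch (illustrative only; even width assumed): 444 chroma is full
+// resolution, so every source stride equals the width; the VU destination is
+// width x ceil(height / 2) bytes.
+#if 0
+static void ExampleI444ToNV21(const uint8_t* y, const uint8_t* u,
+                              const uint8_t* v, uint8_t* out_y,
+                              uint8_t* out_vu, int w, int h) {
+  I444ToNV21(y, w, u, w, v, w, out_y, w, out_vu, w, w, h);
+}
+#endif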
+
// I400 is greyscale typically used in MJPG
LIBYUV_API
int I400ToI420(const uint8_t* src_y,
@@ -269,70 +511,50 @@ int I400ToI420(const uint8_t* src_y,
return 0;
}
-static void CopyPlane2(const uint8_t* src,
- int src_stride_0,
- int src_stride_1,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int y;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+// I400 is greyscale, typically used in MJPG
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_vu || width <= 0 || height == 0) {
+ return -1;
}
-#endif
-
- // Copy plane
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride, width);
- src += src_stride_0 + src_stride_1;
- dst += dst_stride * 2;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
}
- if (height & 1) {
- CopyRow(src, dst, width);
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+ SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
+ return 0;
}
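+
+// Usage sketch (illustrative only; even width assumed): the chroma of the
+// result is uniform grey, because the VU plane is filled with the neutral
+// value 128.
+#if 0
+static void ExampleI400ToNV21(const uint8_t* gray, uint8_t* out_y,
+                              uint8_t* out_vu, int w, int h) {
+  I400ToNV21(gray, w, out_y, w, out_vu, w, w, h);
+}
+#endif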
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to
-// this as well as the two Y planes.
-static int X420ToI420(const uint8_t* src_y,
- int src_stride_y0,
- int src_stride_y1,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
+// Convert NV12 to I420.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
@@ -342,21 +564,16 @@ static int X420ToI420(const uint8_t* src_y,
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
- if (dst_y) {
- dst_y = dst_y + (height - 1) * dst_stride_y;
- }
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
}
// Coalesce rows.
- if (src_stride_y0 == width && src_stride_y1 == width &&
- dst_stride_y == width) {
+ if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
- src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+ src_stride_y = dst_stride_y = 0;
}
// Coalesce rows.
if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
@@ -367,12 +584,7 @@ static int X420ToI420(const uint8_t* src_y,
}
if (dst_y) {
- if (src_stride_y0 == src_stride_y1) {
- CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
- } else {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
// Split UV plane - NV12 / NV21
@@ -382,25 +594,6 @@ static int X420ToI420(const uint8_t* src_y,
return 0;
}
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
- dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
- dst_stride_v, width, height);
-}
-
// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
LIBYUV_API
int NV21ToI420(const uint8_t* src_y,
@@ -415,26 +608,8 @@ int NV21ToI420(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
- dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
- dst_stride_u, width, height);
-}
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
width, height);
}
@@ -492,7 +667,19 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
@@ -573,6 +760,16 @@ int UYVYToI420(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToYRow = UYVYToYRow_Any_MMI;
+ UYVYToUVRow = UYVYToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_MMI;
+ UYVYToUVRow = UYVYToUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_UYVYTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
@@ -600,6 +797,144 @@ int UYVYToI420(const uint8_t* src_uyvy,
return 0;
}
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToUVRow = AYUVToUVRow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToUVRow = AYUVToUVRow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToUVRow = AYUVToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToUVRow = AYUVToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ AYUVToUVRow(src_ayuv, 0, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_vu, int width) = AYUVToVURow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToVURow = AYUVToVURow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToVURow = AYUVToVURow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToVURow = AYUVToVURow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToVURow = AYUVToVURow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToVURow = AYUVToVURow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToVURow = AYUVToVURow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ AYUVToVURow(src_ayuv, 0, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
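+
+// Usage sketch (illustrative only; even width assumed): AYUV is 4 bytes per
+// pixel, so the source stride is width * 4. Each pass of the loop above
+// consumes two source rows and emits two Y rows but only one VU row (4:2:0).
+#if 0
+static void ExampleAYUVToNV21(const uint8_t* ayuv, uint8_t* out_y,
+                              uint8_t* out_vu, int w, int h) {
+  AYUVToNV21(ayuv, w * 4, out_y, w, out_vu, w, w, h);
+}
+#endif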
+
// Convert ARGB to I420.
LIBYUV_API
int ARGBToI420(const uint8_t* src_argb,
@@ -663,17 +998,25 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -749,18 +1092,24 @@ int BGRAToI420(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_BGRATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToYRow = BGRAToYRow_Any_MSA;
+#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BGRAToYRow = BGRAToYRow_Any_MMI;
+ BGRAToUVRow = BGRAToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- BGRAToYRow = BGRAToYRow_MSA;
+ BGRAToUVRow = BGRAToUVRow_MMI;
}
}
#endif
-#if defined(HAS_BGRATOUVROW_MSA)
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToYRow = BGRAToYRow_Any_MSA;
BGRAToUVRow = BGRAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
BGRAToUVRow = BGRAToUVRow_MSA;
}
}
@@ -819,6 +1168,16 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
@@ -835,18 +1194,24 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToYRow = ABGRToYRow_Any_MSA;
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
+ ABGRToUVRow = ABGRToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
ABGRToUVRow = ABGRToUVRow_MSA;
}
}
@@ -921,18 +1286,24 @@ int RGBAToI420(const uint8_t* src_rgba,
}
}
#endif
-#if defined(HAS_RGBATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToYRow = RGBAToYRow_Any_MSA;
+#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGBAToYRow = RGBAToYRow_Any_MMI;
+ RGBAToUVRow = RGBAToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- RGBAToYRow = RGBAToYRow_MSA;
+ RGBAToUVRow = RGBAToUVRow_MMI;
}
}
#endif
-#if defined(HAS_RGBATOUVROW_MSA)
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYRow = RGBAToYRow_Any_MSA;
RGBAToUVRow = RGBAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
RGBAToUVRow = RGBAToUVRow_MSA;
}
}
@@ -967,7 +1338,8 @@ int RGB24ToI420(const uint8_t* src_rgb24,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB24ToUVRow_C;
@@ -1004,7 +1376,21 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
}
-#elif defined(HAS_RGB24TOYROW_MSA)
+// MMI and MSA versions do direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA))
+#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
+ RGB24ToYRow = RGB24ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
RGB24ToYRow = RGB24ToYRow_Any_MSA;
@@ -1013,6 +1399,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
RGB24ToUVRow = RGB24ToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
#if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -1046,14 +1433,16 @@ int RGB24ToI420(const uint8_t* src_rgb24,
#endif
{
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -1070,7 +1459,8 @@ int RGB24ToI420(const uint8_t* src_rgb24,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
#else
@@ -1079,7 +1469,160 @@ int RGB24ToI420(const uint8_t* src_rgb24,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
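+
+// The blocks above follow the dispatch pattern used throughout this file:
+// start from the portable C row function, switch to the Any_ SIMD variant
+// when the CPU flag is set (it tolerates any width), and switch again to the
+// fully aligned variant when the width divides evenly. Sketched below with
+// hypothetical FooRow names, not a real row function in this library:
+#if 0
+void (*FooRow)(const uint8_t* src, uint8_t* dst, int width) = FooRow_C;
+if (TestCpuFlag(kCpuHasNEON)) {
+  FooRow = FooRow_Any_NEON;  // any width; slower tail handling
+  if (IS_ALIGNED(width, 16)) {
+    FooRow = FooRow_NEON;    // full-speed path, width % 16 == 0
+  }
+}
+#endif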
+
+// TODO(fbarchard): Use Matrix version to implement I420 and J420.
+// Convert RGB24 to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)
+ void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVJRow_C;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYJRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_NEON;
+ }
+ }
+ }
+// MMI and MSA versions do direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA))
+#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+ }
+ }
+#endif
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+ RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1099,7 +1642,8 @@ int RAWToI420(const uint8_t* src_raw,
int width,
int height) {
int y;
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+ defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)
void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
uint8_t* dst_v, int width) = RAWToUVRow_C;
void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
@@ -1124,7 +1668,7 @@ int RAWToI420(const uint8_t* src_raw,
}
// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
RAWToYRow = RAWToYRow_Any_NEON;
@@ -1135,7 +1679,21 @@ int RAWToI420(const uint8_t* src_raw,
}
}
}
-#elif defined(HAS_RAWTOYROW_MSA)
+// MMI and MSA versions do direct RAW to YUV.
+#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA))
+#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToUVRow = RAWToUVRow_Any_MMI;
+ RAWToYRow = RAWToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToUVRow = RAWToUVRow_Any_MSA;
RAWToYRow = RAWToYRow_Any_MSA;
@@ -1144,6 +1702,7 @@ int RAWToI420(const uint8_t* src_raw,
RAWToUVRow = RAWToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else
#if defined(HAS_RAWTOARGBROW_SSSE3)
@@ -1177,14 +1736,16 @@ int RAWToI420(const uint8_t* src_raw,
#endif
{
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@@ -1201,7 +1762,8 @@ int RAWToI420(const uint8_t* src_raw,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
#else
@@ -1210,7 +1772,8 @@ int RAWToI420(const uint8_t* src_raw,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1230,7 +1793,8 @@ int RGB565ToI420(const uint8_t* src_rgb565,
int width,
int height) {
int y;
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB565ToUVRow_C;
@@ -1267,7 +1831,21 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
}
}
-#elif defined(HAS_RGB565TOYROW_MSA)
+// MMI and MSA versions do direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA))
+#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
+ RGB565ToYRow = RGB565ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
RGB565ToYRow = RGB565ToYRow_Any_MSA;
@@ -1276,6 +1854,7 @@ int RGB565ToI420(const uint8_t* src_rgb565,
RGB565ToUVRow = RGB565ToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -1316,13 +1895,15 @@ int RGB565ToI420(const uint8_t* src_rgb565,
#endif
#endif
{
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
@@ -1339,7 +1920,8 @@ int RGB565ToI420(const uint8_t* src_rgb565,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
#else
@@ -1348,7 +1930,8 @@ int RGB565ToI420(const uint8_t* src_rgb565,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1368,7 +1951,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
int width,
int height) {
int y;
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB1555ToUVRow_C;
@@ -1406,7 +1990,21 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
}
}
-#elif defined(HAS_ARGB1555TOYROW_MSA)
+// MMI and MSA versions do direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA))
+#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
@@ -1415,6 +2013,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
}
}
+#endif
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -1455,14 +2054,16 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
#endif
#endif
{
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
@@ -1481,7 +2082,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
#else
@@ -1490,7 +2092,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1510,7 +2113,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
int width,
int height) {
int y;
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB4444ToUVRow_C;
@@ -1548,6 +2151,17 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
}
+#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
+ }
+ }
+ }
// Other platforms do intermediate conversion from ARGB4444 to ARGB.
#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
@@ -1594,7 +2208,19 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVRow = ARGBToUVRow_Any_MSA;
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -1609,14 +2235,14 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
#endif
{
-#if !defined(HAS_ARGB4444TOYROW_NEON)
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
@@ -1635,7 +2261,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
#else
@@ -1644,13 +2270,161 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !defined(HAS_ARGB4444TOYROW_NEON)
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
return 0;
}
+// Convert RGB24 to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
+ RGB24ToYJRow_C;
+ if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
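+
+// Usage sketch (illustrative only): extract full-range (JPEG) luma from a
+// tightly packed RGB24 buffer, so the source stride is width * 3.
+#if 0
+static void ExampleRGB24ToJ400(const uint8_t* rgb, uint8_t* gray, int w,
+                               int h) {
+  RGB24ToJ400(rgb, w * 3, gray, w, w, h);
+}
+#endif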
+
+// Convert RAW to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+ RAWToYJRow_C;
+ if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_yj = 0;
+ }
+#if defined(HAS_RAWTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToYJRow = RAWToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RAWToYJRow = RAWToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToYJRow = RAWToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToYJRow(src_raw, dst_yj, width);
+ src_raw += src_stride_raw;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
static void SplitPixels(const uint8_t* src_u,
int src_pixel_stride_uv,
uint8_t* dst_u,
diff --git a/chromium/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libyuv/source/convert_argb.cc
index f2fe474f704..5e7225faf21 100644
--- a/chromium/third_party/libyuv/source/convert_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_argb.cc
@@ -47,18 +47,19 @@ int ARGBCopy(const uint8_t* src_argb,
return 0;
}
-// Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -97,6 +98,14 @@ static int I420ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -226,18 +235,55 @@ int H420ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
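+
+// Usage sketch (illustrative only; even width assumed): BT.2020 I420 to
+// ABGR. As the wrapper above shows, ABGR output reuses the ARGB path with
+// the U and V planes swapped and the mirrored (Yvu) coefficient table.
+#if 0
+static void ExampleU420ToABGR(const uint8_t* y, const uint8_t* u,
+                              const uint8_t* v, uint8_t* abgr, int w, int h) {
+  U420ToABGR(y, w, u, w / 2, v, w / 2, abgr, w * 4, w, h);
+}
+#endif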
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -283,6 +329,14 @@ static int I422ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -410,20 +464,286 @@ int H422ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I444ToARGBRow = I444ToARGBRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
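Taken together, the 444 wrappers spell out the naming scheme the rest of this roll follows: the leading letter picks the colorimetry (I = BT.601 limited range, J = JPEG full range, H = BT.709, U = BT.2020), the digits give the chroma layout (420, 422, 444), and each ARGB/ABGR pair is backed by the matching kYuv*/kYvu* constant pair.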
+// Convert 10 bit YUV to AR30 with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
-static int I010ToAR30Matrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -500,6 +820,23 @@ int H010ToAR30(const uint16_t* src_y,
&kYuvH709Constants, width, height);
}
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
// Convert I010 to AB30.
LIBYUV_API
int I010ToAB30(const uint16_t* src_y,
@@ -534,18 +871,193 @@ int H010ToAB30(const uint16_t* src_y,
&kYvuH709Constants, width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
-static int I010ToARGBMatrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants, width, height);
+}
+
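The 10-bit entry points take uint16_t planes with the sample in the low 10 bits, and the source strides are counted in uint16_t elements rather than bytes (the row loops advance uint16_t pointers by the stride); the AR30 destination stride stays in bytes. A usage sketch for the new BT.2020 wrapper, assuming packed planes:

#include "libyuv/convert_argb.h"

/* Sketch: 10-bit 4:2:0 BT.2020 (U010) to 2:10:10:10 AR30. */
int RenderU010Frame(const uint16_t* y, const uint16_t* u, const uint16_t* v,
                    uint8_t* ar30, int width, int height) {
  int half_width = (width + 1) / 2;  /* 4:2:0 chroma width, in elements */
  return U010ToAR30(y, width, u, half_width, v, half_width,
                    ar30, width * 4, width, height);
}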
+// Convert 10 bit 422 YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants, width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -576,6 +1088,14 @@ static int I010ToARGBMatrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I210TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I210ToARGBRow = I210ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I210ToARGBRow = I210ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
@@ -660,23 +1180,60 @@ int H010ToABGR(const uint16_t* src_y,
width, height);
}
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
- void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
- I444ToARGBRow_C;
+ I210ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -686,48 +1243,32 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I444TOARGBROW_SSSE3)
+#if defined(HAS_I210TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
}
}
#endif
-#if defined(HAS_I444TOARGBROW_AVX2)
+#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I444ToARGBRow = I444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I444ToARGBRow = I444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_NEON;
+ I210ToARGBRow = I210ToARGBRow_AVX2;
}
}
#endif
-#if defined(HAS_I444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I444ToARGBRow = I444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_MSA;
+#if defined(HAS_I210TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I210ToARGBRow = I210ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I210ToARGBRow = I210ToARGBRow_MMI;
}
}
#endif
-
for (y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -736,74 +1277,130 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
return 0;
}
-// Convert I444 to ARGB.
+// Convert I210 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
+int I210ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_argb, dst_stride_argb,
&kYuvI601Constants, width, height);
}
-// Convert I444 to ABGR.
+// Convert I210 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
+int I210ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
src_stride_v, // Swap U and V
src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
-// Convert J444 to ARGB.
+// Convert H210 to ARGB.
LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
+int H210ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
}
-// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height,
- int attenuate) {
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
int y;
void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* a_buf,
@@ -845,6 +1442,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422ALPHATOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
@@ -877,6 +1482,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
@@ -946,16 +1559,18 @@ int I420AlphaToABGR(const uint8_t* src_y,
width, height, attenuate);
}
-// Convert I400 to ARGB.
+// Convert I400 to ARGB with matrix.
LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
- void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
+ void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
I400ToARGBRow_C;
if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -996,6 +1611,14 @@ int I400ToARGB(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I400ToARGBRow = I400ToARGBRow_Any_MSA;
@@ -1006,13 +1629,25 @@ int I400ToARGB(const uint8_t* src_y,
#endif
for (y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
+ I400ToARGBRow(src_y, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
return 0;
}
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
// Convert J400 to ARGB.
LIBYUV_API
int J400ToARGB(const uint8_t* src_y,
@@ -1063,6 +1698,14 @@ int J400ToARGB(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_J400TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ J400ToARGBRow = J400ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_J400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
J400ToARGBRow = J400ToARGBRow_Any_MSA;
@@ -1193,6 +1836,14 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
+#if defined(HAS_RGB24TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_RGB24TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
@@ -1252,6 +1903,14 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
+#if defined(HAS_RAWTOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RAWToARGBRow = RAWToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_RAWTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToARGBRow = RAWToARGBRow_Any_MSA;
@@ -1269,6 +1928,57 @@ int RAWToARGB(const uint8_t* src_raw,
return 0;
}
+// Convert RAW to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
+ RAWToRGBARow_C;
+ if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_rgba = 0;
+ }
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToRGBARow = RAWToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGBARow = RAWToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToRGBARow = RAWToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGBARow = RAWToRGBARow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToRGBARow(src_raw, dst_rgba, width);
+ src_raw += src_stride_raw;
+ dst_rgba += dst_stride_rgba;
+ }
+ return 0;
+}
+
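RAWToRGBA is new in this roll; RAW is 3-byte-per-pixel RGB in r,g,b memory order. A minimal call, assuming packed rows:

#include "libyuv/convert_argb.h"

/* Sketch: expand packed 24-bit RAW rows to 32-bit RGBA. */
int ExpandRawFrame(const uint8_t* raw, uint8_t* rgba, int width, int height) {
  return RAWToRGBA(raw, width * 3, rgba, width * 4, width, height);
}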
// Convert RGB565 to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8_t* src_rgb565,
@@ -1319,6 +2029,14 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
}
}
#endif
+#if defined(HAS_RGB565TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_RGB565TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
@@ -1386,6 +2104,14 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
}
}
#endif
+#if defined(HAS_ARGB1555TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGB1555TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
@@ -1453,6 +2179,14 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444,
}
}
#endif
+#if defined(HAS_ARGB4444TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGB4444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
@@ -1566,16 +2300,17 @@ int AR30ToAB30(const uint8_t* src_ar30,
return 0;
}
-// Convert NV12 to ARGB with matrix
-static int NV12ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1613,6 +2348,14 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV12TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ NV12ToARGBRow = NV12ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_NV12TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
@@ -1633,16 +2376,17 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to ARGB with matrix
-static int NV21ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1680,6 +2424,14 @@ static int NV21ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ NV21ToARGBRow = NV21ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_NV21TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
@@ -1729,8 +2481,9 @@ int NV21ToARGB(const uint8_t* src_y,
}
// Convert NV12 to ABGR.
-// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix.
+// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix.
// To swap the UV use NV12 instead of NV21.
+LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
@@ -1758,16 +2511,17 @@ int NV21ToABGR(const uint8_t* src_y,
}
// TODO(fbarchard): Consider SSSE3 2 step conversion.
-// Convert NV12 to RGB24 with matrix
-static int NV12ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1805,6 +2559,14 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV12TORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
@@ -1817,16 +2579,17 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to RGB24 with matrix
-static int NV21ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1864,6 +2627,14 @@ static int NV21ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
@@ -1876,7 +2647,6 @@ static int NV21ToRGB24Matrix(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix.
// Convert NV12 to RGB24.
LIBYUV_API
int NV12ToRGB24(const uint8_t* src_y,
@@ -1907,72 +2677,79 @@ int NV21ToRGB24(const uint8_t* src_y,
width, height);
}
-// Convert M420 to ARGB.
+// Convert NV12 to RAW.
LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
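The two RAW wrappers above cross over on purpose: RAW is byte-reversed RGB24, so NV12 input goes through the NV21 (VU-swapped) RGB24 path with the mirrored constants, and vice versa, which reverses each pixel's byte order without any new row code. A usage sketch, assuming the libyuv headers and an even-width, tightly packed frame (the interleaved UV plane then shares the Y stride):

#include "libyuv/convert_argb.h"

/* Sketch: NV12 (Y plane + interleaved half-height UV) to packed RAW. */
int Nv12FrameToRaw(const uint8_t* y, const uint8_t* uv,
                   uint8_t* raw, int width, int height) {
  return NV12ToRAW(y, width, uv, width, raw, width * 3, width, height);
}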
+// Convert NV21 to YUV24.
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height) {
int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
+ void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
+ uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
+ if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+ dst_stride_yuv24 = -dst_stride_yuv24;
}
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ NV21ToYUV24Row = NV21ToYUV24Row_NEON;
}
}
#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
}
}
#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
+ for (y = 0; y < height; ++y) {
+ NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+ dst_yuv24 += dst_stride_yuv24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
}
}
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, &kYuvI601Constants, width);
- dst_argb += dst_stride_argb * 2;
- src_m420 += src_stride_m420 * 3;
- }
- if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- }
return 0;
}
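Note the `if (y & 1)` step in the loop above: NV21's VU plane is subsampled 2x vertically, so one chroma row serves two luma rows. Reduced to a standalone sketch with a hypothetical row callback:

#include <stdint.h>

/* Hypothetical per-row function type, standing in for NV21ToYUV24Row. */
typedef void (*RowFn)(const uint8_t* y, const uint8_t* vu,
                      uint8_t* dst, int width);

static void For420Rows(RowFn fn,
                       const uint8_t* src_y, int src_stride_y,
                       const uint8_t* src_vu, int src_stride_vu,
                       uint8_t* dst, int dst_stride,
                       int width, int height) {
  for (int y = 0; y < height; ++y) {
    fn(src_y, src_vu, dst, width);
    dst += dst_stride;
    src_y += src_stride_y;
    if (y & 1) {  /* one VU row covers two Y rows */
      src_vu += src_stride_vu;
    }
  }
}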
@@ -2027,6 +2804,14 @@ int YUY2ToARGB(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
@@ -2094,6 +2879,14 @@ int UYVYToARGB(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ UYVYToARGBRow = UYVYToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
@@ -2124,7 +2917,7 @@ static void WeavePixels(const uint8_t* src_u,
}
}
-// Convert Android420 to ARGB.
+// Convert Android420 to ARGB with matrix.
LIBYUV_API
int Android420ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
@@ -2225,6 +3018,1107 @@ int Android420ToABGR(const uint8_t* src_y,
height);
}
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGBARow = I422ToRGBARow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
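I422ToRGBAMatrix repeats the dispatch ladder used throughout this file: start from the portable C row function, upgrade to the _Any_ SIMD variant when TestCpuFlag reports support (it vectorizes the bulk and handles the ragged tail itself), and take the fully vectorized variant only when the width meets its alignment multiple. The shape of that selection as a self-contained sketch with stub rows (all names here are illustrative, not libyuv internals):

#include <stdbool.h>
#include <stdint.h>

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

static void Row_C(const uint8_t* s, uint8_t* d, int w) {
  (void)s; (void)d; (void)w;  /* scalar fallback */
}
static void Row_Any_SIMD(const uint8_t* s, uint8_t* d, int w) {
  (void)s; (void)d; (void)w;  /* SIMD body + scalar tail, any width */
}
static void Row_SIMD(const uint8_t* s, uint8_t* d, int w) {
  (void)s; (void)d; (void)w;  /* SIMD only; requires width % 16 == 0 */
}

static RowFn PickRowFn(bool cpu_has_simd, int width) {
  RowFn fn = Row_C;
  if (cpu_has_simd) {
    fn = Row_Any_SIMD;
    if ((width & 15) == 0) {  /* IS_ALIGNED(width, 16) */
      fn = Row_SIMD;
    }
  }
  return fn;
}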
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB565Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGBARow = I422ToRGBARow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGB24Row = I422ToRGB24Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to RGB24.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to RAW.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGB565Row = I422ToRGB565Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+ width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
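For orientation, a minimal caller sketch (editorial, not part of the patch; contiguous I420 planes and the libyuv.h umbrella header are assumptions). Passing a null dither4x4 selects the built-in kDither565_4x4 table, per the fallback above:

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    // Convert a contiguous I420 frame to dithered RGB565 (2 bytes/pixel).
    int ToRGB565Dithered(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* rgb565, int width, int height) {
      const int half = (width + 1) / 2;  // I420 chroma stride
      return libyuv::I420ToRGB565Dither(y, width, u, half, v, half,
                                        rgb565, width * 2,
                                        /*dither4x4=*/nullptr, width, height);
    }
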
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToAR30Row_C;
+
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToAR30Row = I422ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToAR30Row = I422ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYvuH709Constants, width, height);
+}
+
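A similar hedged sketch for the new AR30 entry points: AR30 is a 10-bit-per-channel format with 2 alpha bits, packed 4 bytes per pixel, so the destination stride is width * 4; H420ToAR30 differs only in the constants it forwards:

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    int ToAR30(const uint8_t* y, const uint8_t* u, const uint8_t* v,
               uint8_t* ar30, int width, int height) {
      const int half = (width + 1) / 2;  // I420 chroma stride
      return libyuv::I420ToAR30(y, width, u, half, v, half,
                                ar30, width * 4, width, height);
    }
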
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc
index 6fa253237ee..f2cfc1d8f53 100644
--- a/chromium/third_party/libyuv/source/convert_from.cc
+++ b/chromium/third_party/libyuv/source/convert_from.cc
@@ -294,6 +294,14 @@ int I420ToYUY2(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToYUY2Row = I422ToYUY2Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
@@ -373,6 +381,14 @@ int I422ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -440,6 +456,14 @@ int I420ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -464,7 +488,6 @@ int I420ToUYVY(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): test negative height for invert.
LIBYUV_API
int I420ToNV12(const uint8_t* src_y,
int src_stride_y,
@@ -478,12 +501,23 @@ int I420ToNV12(const uint8_t* src_y,
int dst_stride_uv,
int width,
int height) {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
- int halfwidth = (width + 1) / 2;
- int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
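With this hunk applied, I420ToNV12 follows the library-wide convention that a negative height flips the image vertically, adjusting the source pointers up front instead of the earlier half-height arithmetic. A hedged sketch (contiguous planes assumed):

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    // Produce a vertically flipped NV12 frame from I420 by negating height.
    int FlipI420ToNV12(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
      const int half = (width + 1) / 2;
      return libyuv::I420ToNV12(y, width, u, half, v, half, dst_y, width,
                                dst_uv, half * 2, width, -height);
    }
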
@@ -510,755 +544,6 @@ int I420ToNV21(const uint8_t* src_y,
width, height);
}
-// Convert I422 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to RGB24.
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to RAW.
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height) {
- int y;
- void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
- dst_stride_argb1555 = -dst_stride_argb1555;
- }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
- width);
- dst_argb1555 += dst_stride_argb1555;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height) {
- int y;
- void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
- dst_stride_argb4444 = -dst_stride_argb4444;
- }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
- width);
- dst_argb4444 += dst_stride_argb4444;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I422 to RGB565.
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
- ARGBToRGB565DitherRow_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
- if (!dither4x4) {
- dither4x4 = kDither565_4x4;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
- }
- }
-#endif
- {
- // Allocate a row of argb.
- align_buffer_64(row_argb, width * 4);
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
- ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
- width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row_argb);
- }
- return 0;
-}
-
-// Convert I420 to AR30 with matrix
-static int I420ToAR30Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToAR30Row_C;
-
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-
-#if defined(HAS_I422TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToAR30Row = I422ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToAR30Row = I422ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToAR30Row = I422ToAR30Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYvuH709Constants, width, height);
-}
-
// Convert I420 to specified format
LIBYUV_API
int ConvertFromI420(const uint8_t* y,
@@ -1360,7 +645,6 @@ int ConvertFromI420(const uint8_t* y,
height);
break;
}
- // TODO(fbarchard): Add M420.
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/chromium/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libyuv/source/convert_from_argb.cc
index c8d91252e9b..4ba4bb5e0f5 100644
--- a/chromium/third_party/libyuv/source/convert_from_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_from_argb.cc
@@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOUV444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOUV444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
@@ -100,6 +108,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -191,17 +207,26 @@ int ARGBToI422(const uint8_t* src_argb,
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -282,17 +307,25 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -322,6 +355,14 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -418,17 +459,25 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -458,6 +507,14 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -490,6 +547,309 @@ int ARGBToNV21(const uint8_t* src_argb,
return 0;
}
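+// Convert ABGR to NV12.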
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of UV, each rounded up to a multiple of 32 bytes.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
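A usage sketch for the new ABGR path (editorial; assumes a contiguous 4-byte-per-pixel ABGR buffer and the umbrella header):

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    int AbgrToNV12(const uint8_t* abgr, uint8_t* dst_y, uint8_t* dst_uv,
                   int width, int height) {
      // NV12's interleaved UV plane has stride 2 * ((width + 1) / 2).
      return libyuv::ABGRToNV12(abgr, width * 4, dst_y, width,
                                dst_uv, ((width + 1) / 2) * 2, width, height);
    }
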
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of UV, each rounded up to a multiple of 32 bytes.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
// Convert ARGB to YUY2.
LIBYUV_API
int ARGBToYUY2(const uint8_t* src_argb,
@@ -559,17 +919,25 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -599,6 +967,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToYUY2Row = I422ToYUY2Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
@@ -696,17 +1072,25 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
@@ -736,6 +1120,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -813,6 +1205,14 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -903,6 +1303,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
@@ -969,6 +1377,14 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRAWRow = ARGBToRAWRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORAWROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
@@ -1039,6 +1455,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
@@ -1108,6 +1532,14 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB565ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
@@ -1174,6 +1606,14 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOARGB1555ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
@@ -1240,6 +1680,14 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOARGB4444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
@@ -1416,17 +1864,25 @@ int ARGBToJ420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
@@ -1517,17 +1973,25 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
@@ -1594,6 +2058,14 @@ int ARGBToJ400(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -1611,6 +2083,80 @@ int ARGBToJ400(const uint8_t* src_argb,
return 0;
}
+// Convert RGBA to J400.
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
+ RGBAToYJRow_C;
+ if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows.
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGBATOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGBAToYJRow = RGBAToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYJRow = RGBAToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYJRow = RGBAToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYJRow = RGBAToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGBAToYJRow(src_rgba, dst_yj, width);
+ src_rgba += src_stride_rgba;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
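A hedged caller for the new grayscale path; J400 is full-range luma at one byte per pixel. Note the coalescing fast path above: when both buffers are contiguous (stride equals width in pixels), the whole frame is processed as a single long row:

    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header

    int RgbaToGray(const uint8_t* rgba, uint8_t* gray, int width, int height) {
      return libyuv::RGBAToJ400(rgba, width * 4, gray, width, width, height);
    }
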
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/convert_jpeg.cc b/chromium/third_party/libyuv/source/convert_jpeg.cc
index ae3cc18cd24..d7556ee91ba 100644
--- a/chromium/third_party/libyuv/source/convert_jpeg.cc
+++ b/chromium/third_party/libyuv/source/convert_jpeg.cc
@@ -89,12 +89,12 @@ static void JpegI400ToI420(void* opaque,
// Query size of MJPG in pixels.
LIBYUV_API
-int MJPGSize(const uint8_t* sample,
- size_t sample_size,
+int MJPGSize(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
int* width,
int* height) {
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
if (ret) {
*width = mjpeg_decoder.GetWidth();
*height = mjpeg_decoder.GetHeight();
@@ -107,8 +107,8 @@ int MJPGSize(const uint8_t* sample,
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
// dst_height may be enough.
LIBYUV_API
-int MJPGToI420(const uint8_t* sample,
- size_t sample_size,
+int MJPGToI420(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
@@ -119,14 +119,14 @@ int MJPGToI420(const uint8_t* sample,
int src_height,
int dst_width,
int dst_height) {
- if (sample_size == kUnknownDataSize) {
+ if (src_size_mjpg == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
mjpeg_decoder.GetHeight() != src_height)) {
// ERROR: MJPEG frame has unexpected dimensions
@@ -180,9 +180,281 @@ int MJPGToI420(const uint8_t* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
dst_height);
} else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice.
- // ERROR: Unable to convert MJPEG frame because format is not supported
+ // TODO(fbarchard): Implement conversion for any other colorspace/subsample
+ // factors that occur in practice.
+ // ERROR: Unable to convert MJPEG frame because format is not supported.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
+struct NV21Buffers {
+ uint8_t* y;
+ int y_stride;
+ uint8_t* vu;
+ int vu_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV21.
+LIBYUV_API
+int MJPGToNV21(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (src_size_mjpg == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, dst_width, dst_height};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
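For context, a same-size decode sketch (assumptions: a HAVE_JPEG build, the umbrella header, and destination buffers already sized from MJPGSize, shown earlier in this file):

    #include <stddef.h>
    #include <stdint.h>
    #include "libyuv.h"  // assumed umbrella header; MJPG paths need HAVE_JPEG

    int DecodeMjpgToNV21(const uint8_t* jpg, size_t jpg_size, uint8_t* dst_y,
                         uint8_t* dst_vu, int width, int height) {
      const int vu_stride = ((width + 1) / 2) * 2;
      // Source and destination dimensions match; MJPGToNV21 re-verifies them.
      return libyuv::MJPGToNV21(jpg, jpg_size, dst_y, width, dst_vu, vu_stride,
                                width, height, width, height);
    }
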
+static void JpegI420ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+  // Use the NV21 converter: with no chroma samples to swap, NV12 and NV21
+  // are identical.
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV12.
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ // Use NV21Buffers but with UV instead of VU.
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv,
+ dst_stride_uv, dst_width, dst_height};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
mjpeg_decoder.UnloadFrame();
return 1;
}
@@ -190,7 +462,6 @@ int MJPGToI420(const uint8_t* sample,
return ret ? 0 : 1;
}
-#ifdef HAVE_JPEG
struct ARGBBuffers {
uint8_t* argb;
int argb_stride;
@@ -245,22 +516,22 @@ static void JpegI400ToARGB(void* opaque,
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
// dst_height may be enough.
LIBYUV_API
-int MJPGToARGB(const uint8_t* sample,
- size_t sample_size,
+int MJPGToARGB(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
uint8_t* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
int dst_width,
int dst_height) {
- if (sample_size == kUnknownDataSize) {
+ if (src_size_mjpg == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
mjpeg_decoder.GetHeight() != src_height)) {
// ERROR: MJPEG frame has unexpected dimensions
@@ -313,18 +584,17 @@ int MJPGToARGB(const uint8_t* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
dst_height);
} else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice.
- // ERROR: Unable to convert MJPEG frame because format is not supported
+      // TODO(fbarchard): Implement conversion for any other
+      // colorspace/subsample factors that occur in practice.
+      // ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
}
}
return ret ? 0 : 1;
}
-#endif
-#endif
+#endif // HAVE_JPEG
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libyuv/source/convert_to_argb.cc
index bde1aa8891b..84df16c8c26 100644
--- a/chromium/third_party/libyuv/source/convert_to_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_to_argb.cc
@@ -32,9 +32,6 @@ extern "C" {
// TODO(fbarchard): Add the following:
// H010ToARGB
// I010ToARGB
-// J400ToARGB
-// J422ToARGB
-// J444ToARGB
LIBYUV_API
int ConvertToARGB(const uint8_t* sample,
@@ -161,6 +158,11 @@ int ConvertToARGB(const uint8_t* sample,
r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
+ case FOURCC_J400:
+ src = sample + src_width * crop_y + crop_x;
+ r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height);
+ break;
// Biplanar formats
case FOURCC_NV12:
@@ -178,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample,
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
dst_stride_argb, crop_width, inv_crop_height);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
- inv_crop_height);
- break;
-
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
@@ -208,6 +204,19 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J420: {
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8_t* src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
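+      // Planar J420 layout assumed by the offset math above:
+      //   Y: src_width * abs_src_height bytes, then
+      //   U: halfwidth * halfheight bytes, then
+      //   V: halfwidth * halfheight bytes.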
+
case FOURCC_H420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
@@ -221,7 +230,7 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
- case FOURCC_J420: {
+ case FOURCC_U420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
@@ -229,7 +238,7 @@ int ConvertToARGB(const uint8_t* sample,
(halfwidth * crop_y + crop_x) / 2;
const uint8_t* src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
@@ -256,6 +265,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H422: {
int halfwidth = (src_width + 1) / 2;
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -268,6 +289,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_U422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_I444:
case FOURCC_YV24: {
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -284,6 +317,40 @@ int ConvertToARGB(const uint8_t* sample,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_J444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_H444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_U444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
diff --git a/chromium/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libyuv/source/convert_to_i420.cc
index df08309f9ba..ac6eeab24ef 100644
--- a/chromium/third_party/libyuv/source/convert_to_i420.cc
+++ b/chromium/third_party/libyuv/source/convert_to_i420.cc
@@ -179,11 +179,6 @@ int ConvertToI420(const uint8_t* sample,
dst_stride_y, dst_v, dst_stride_v, dst_u,
dst_stride_u, crop_width, inv_crop_height, rotation);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
@@ -193,15 +188,15 @@ int ConvertToI420(const uint8_t* sample,
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
- src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+ (crop_x / 2);
src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
} else {
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+ (crop_x / 2);
src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
}
r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
@@ -216,14 +211,14 @@ int ConvertToI420(const uint8_t* sample,
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
- crop_x / 2;
+ (crop_x / 2);
src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
} else {
src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
- crop_x / 2;
+ (crop_x / 2);
src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
}
r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc
index 31e24b6739b..fe89452b772 100644
--- a/chromium/third_party/libyuv/source/cpu_id.cc
+++ b/chromium/third_party/libyuv/source/cpu_id.cc
@@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
+ "mov %%ebx, %%edi \n"
"cpuid \n"
- "xchg %%edi, %%ebx \n"
+ "xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
@@ -163,32 +163,38 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
}
// TODO(fbarchard): Consider read_msa_ir().
-// TODO(fbarchard): Add unittest.
-LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
- const char ase[]) {
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
+ int flag = 0x0;
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
- // ase enabled if /proc/cpuinfo is unavailable.
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
+    // Assume no features if /proc/cpuinfo is unavailable.
+    // This occurs inside the Chrome sandbox for the Pepper and Render
+    // processes.
return 0;
}
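+  // Illustrative /proc/cpuinfo lines this loop matches (not verbatim from
+  // any particular kernel):
+  //   cpu model            : Loongson-3A R3 (Loongson-3A3000)
+  //   ASEs implemented     : vz loongson-mmi loongson-cam loongson-ext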
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+      // Work around early kernels that omit mmi from the ASEs line.
+ if (strstr(cpuinfo_line, "Loongson-3")) {
+ flag |= kCpuHasMMI;
+ } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+ flag |= kCpuHasMMI | kCpuHasMSA;
+ }
+ }
if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
- char* p = strstr(cpuinfo_line, ase);
- if (p) {
- fclose(f);
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- return 0;
+ if (strstr(cpuinfo_line, "loongson-mmi") &&
+ strstr(cpuinfo_line, "loongson-ext")) {
+ flag |= kCpuHasMMI;
+ }
+ if (strstr(cpuinfo_line, "msa")) {
+ flag |= kCpuHasMSA;
}
+      // The ASEs line is the last one we need, so stop scanning here.
+ break;
}
}
fclose(f);
- return 0;
+ return flag;
}
static SAFEBUFFERS int GetCpuFlags(void) {
@@ -230,9 +236,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
}
#endif
#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_msa)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
-#endif
+ cpu_info = MipsCpuCaps("/proc/cpuinfo");
cpu_info |= kCpuHasMIPS;
#endif
#if defined(__arm__) || defined(__aarch64__)
diff --git a/chromium/third_party/libyuv/source/mjpeg_decoder.cc b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
index eaf2530130b..adba832f53f 100644
--- a/chromium/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
@@ -25,7 +25,8 @@
#endif
#endif
-struct FILE; // For jpeglib.h.
+
+#include <stdio.h> // For jpeglib.h.
// C++ build requires extern C for jpeg internals.
#ifdef __cplusplus
@@ -416,7 +417,10 @@ void init_source(j_decompress_ptr cinfo) {
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
+ // Don't assert-fail when fuzzing.
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
assert(0 && "No more data");
+#endif
// ERROR: No more data
return FALSE;
}
@@ -427,7 +431,15 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
}
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
- cinfo->src->next_input_byte += num_bytes;
+ jpeg_source_mgr* src = cinfo->src;
+ size_t bytes = static_cast<size_t>(num_bytes);
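+  // Clamp the skip to the bytes still in the buffer; advancing
+  // next_input_byte past the end would let later reads run out of bounds.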
+ if (bytes > src->bytes_in_buffer) {
+ src->next_input_byte = nullptr;
+ src->bytes_in_buffer = 0;
+ } else {
+ src->next_input_byte += bytes;
+ src->bytes_in_buffer -= bytes;
+ }
}
void term_source(j_decompress_ptr cinfo) {
diff --git a/chromium/third_party/libyuv/source/mjpeg_validate.cc b/chromium/third_party/libyuv/source/mjpeg_validate.cc
index 80c2cc0cb9b..ba0a03ab9e5 100644
--- a/chromium/third_party/libyuv/source/mjpeg_validate.cc
+++ b/chromium/third_party/libyuv/source/mjpeg_validate.cc
@@ -18,10 +18,10 @@ extern "C" {
#endif
// Helper function to scan for EOI marker (0xff 0xd9).
-static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) {
- if (sample_size >= 2) {
- const uint8_t* end = sample + sample_size - 1;
- const uint8_t* it = sample;
+static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+ if (src_size_mjpg >= 2) {
+ const uint8_t* end = src_mjpg + src_size_mjpg - 1;
+ const uint8_t* it = src_mjpg;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = (const uint8_t*)(memchr(it, 0xff, end - it));
@@ -34,34 +34,35 @@ static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) {
++it; // Skip over current 0xff.
}
}
- // ERROR: Invalid jpeg end code not found. Size sample_size
+  // ERROR: Invalid jpeg: end code not found. Size: src_size_mjpg
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
-LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) {
+LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
- if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
- // ERROR: Invalid jpeg size: sample_size
+ if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) {
+ // ERROR: Invalid jpeg size: src_size_mjpg
return LIBYUV_FALSE;
}
- if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
+  // SOI marker (0xff 0xd8), plus the 0xff that starts the next marker.
+ if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) {
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
// Look for the End Of Image (EOI) marker near the end of the buffer.
- if (sample_size > kBackSearchSize) {
- if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ if (src_size_mjpg > kBackSearchSize) {
+ if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
// Reduce search size for forward search.
- sample_size = sample_size - kBackSearchSize + 1;
+ src_size_mjpg = src_size_mjpg - kBackSearchSize + 1;
}
// Step over SOI marker and scan for EOI.
- return ScanEOI(sample + 2, sample_size - 2);
+ return ScanEOI(src_mjpg + 2, src_size_mjpg - 2);
}
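+// Usage sketch (illustrative, not part of this patch):
+//   if (!ValidateJpeg(src_mjpg, src_size_mjpg)) {
+//     return 1;  // Reject the buffer before attempting a full decode.
+//   }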
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc
index 5eae3f763a7..4e8908c2eba 100644
--- a/chromium/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libyuv/source/planar_functions.cc
@@ -349,6 +349,39 @@ int I420ToI400(const uint8_t* src_y,
return 0;
}
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+ int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_uv, int dst_stride_uv, int width, int height) {
+ if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+ halfheight);
+ return 0;
+}
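+// Usage sketch (illustrative, tightly packed buffers): a negative height
+// copies the image flipped vertically:
+//   NV12Copy(src_y, w, src_uv, w, dst_y, w, dst_uv, w, w, -h);
+// The chroma rows above are halfwidth * 2 bytes wide because U and V are
+// interleaved in a single plane.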
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+ int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_vu, int dst_stride_vu, int width, int height) {
+ return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
// Support function for NV12 etc. UV channels.
// Width and height are plane sizes (typically half the pixel width).
LIBYUV_API
@@ -402,6 +435,14 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
@@ -432,7 +473,6 @@ void MergeUVPlane(const uint8_t* src_u,
int y;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
- // Coalesce rows.
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -470,6 +510,14 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow = MergeUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
@@ -488,6 +536,96 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_vu = 0;
+ }
+
+#if defined(HAS_SWAPUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SwapUVRow = SwapUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SwapUVRow = SwapUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SwapUVRow = SwapUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SwapUVRow(src_uv, dst_vu, width);
+ src_uv += src_stride_uv;
+ dst_vu += dst_stride_vu;
+ }
+}
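+// The "coalesce rows" step above is a recurring libyuv optimization: when
+// both strides equal width * 2 the plane is contiguous, so it can be
+// processed as one row of width * height elements and the row function is
+// called once instead of once per row.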
+
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_vu || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_vu = src_vu + (halfheight - 1) * src_stride_vu;
+ src_stride_vu = -src_stride_vu;
+ }
+
+ SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
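+// Note: dst_y may be NULL to convert only the chroma plane; the luma copy
+// above is skipped in that case.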
+
// Support function for NV12 etc RGB channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
@@ -529,6 +667,14 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
+#if defined(HAS_SPLITRGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitRGBRow = SplitRGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ SplitRGBRow = SplitRGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitRGBRow = SplitRGBRow_Any_NEON;
@@ -593,6 +739,14 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
#endif
+#if defined(HAS_MERGERGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeRGBRow = MergeRGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MergeRGBRow = MergeRGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of RGB.
@@ -604,62 +758,6 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
- int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
-
- // Mirror plane
- for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -724,7 +822,17 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ YUY2ToUV422Row = YUY2ToUV422Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
@@ -810,7 +918,17 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MSA)
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToYRow = UYVYToYRow_Any_MMI;
+ UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_MMI;
+ UYVYToUV422Row = UYVYToUV422Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
@@ -882,6 +1000,14 @@ int YUY2ToY(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_YUY2TOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
@@ -899,6 +1025,130 @@ int YUY2ToY(const uint8_t* src_yuy2,
return 0;
}
+// Mirror a plane of data.
+// See also I400Mirror().
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
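+// Dispatch pattern used throughout this file: start with the portable
+// MirrorRow_C, then upgrade the function pointer when the CPU supports a
+// SIMD path. The _Any_ variants handle arbitrary widths; the exact variants
+// are only chosen when the width meets the kernel's alignment requirement.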
+
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+ MirrorUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorUVRow = MirrorUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorUVRow = MirrorUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorUVRow = MirrorUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorUVRow = MirrorUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorUVRow = MirrorUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_MSA;
+ }
+ }
+#endif
+
+ // MirrorUV plane
+ for (y = 0; y < height; ++y) {
+ MirrorUVRow(src_uv, dst_uv, width);
+ src_uv += src_stride_uv;
+ dst_uv += dst_stride_uv;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -939,7 +1189,7 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
@@ -963,6 +1213,41 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
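+// NV12 chroma is mirrored with MirrorUVPlane rather than MirrorPlane: the
+// row must be reversed in 2-byte UV pairs, since reversing single bytes
+// would also swap U and V.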
+
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -986,7 +1271,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -1007,6 +1292,14 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMirrorRow = ARGBMirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBMIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
@@ -1025,6 +1318,52 @@ int ARGBMirror(const uint8_t* src_argb,
return 0;
}
+// RGB24 mirror.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
+ RGB24MirrorRow_C;
+ if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+#if defined(HAS_RGB24MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_SSSE3;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ RGB24MirrorRow(src_rgb24, dst_rgb24, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
// Get a blender optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
@@ -1043,6 +1382,11 @@ ARGBBlendRow GetARGBBlend() {
ARGBBlendRow = ARGBBlendRow_NEON;
}
#endif
+#if defined(HAS_ARGBBLENDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBBlendRow = ARGBBlendRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBBLENDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBBlendRow = ARGBBlendRow_MSA;
@@ -1140,6 +1484,14 @@ int BlendPlane(const uint8_t* src_y0,
}
}
#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BlendPlaneRow = BlendPlaneRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ BlendPlaneRow = BlendPlaneRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
@@ -1216,6 +1568,14 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BlendPlaneRow = BlendPlaneRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ BlendPlaneRow = BlendPlaneRow_MMI;
+ }
+ }
+#endif
if (!IS_ALIGNED(width, 2)) {
ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
}
@@ -1252,6 +1612,17 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ ScaleRowDown2 = ScaleRowDown2Box_MMI;
+ }
+ }
+ }
+#endif
// Row buffer for intermediate alpha pixels.
align_buffer_64(halfalpha, halfwidth);
@@ -1329,6 +1700,14 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBMULTIPLYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBMULTIPLYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
@@ -1406,6 +1785,14 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBADDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAddRow = ARGBAddRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAddRow = ARGBAddRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAddRow = ARGBAddRow_Any_MSA;
@@ -1478,6 +1865,14 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBSUBTRACTROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBSubtractRow = ARGBSubtractRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBSUBTRACTROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
@@ -1496,177 +1891,6 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
return 0;
}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB565Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
// Convert RAW to RGB24.
LIBYUV_API
@@ -1710,6 +1934,14 @@ int RAWToRGB24(const uint8_t* src_raw,
}
}
#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RAWToRGB24Row = RAWToRGB24Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_RAWTORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
@@ -1853,6 +2085,14 @@ int ARGBRect(uint8_t* dst_argb,
ARGBSetRow = ARGBSetRow_X86;
}
#endif
+#if defined(HAS_ARGBSETROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBSetRow = ARGBSetRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBSETROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBSetRow = ARGBSetRow_Any_MSA;
@@ -1931,6 +2171,14 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
@@ -2034,6 +2282,11 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBGrayRow = ARGBGrayRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
@@ -2079,6 +2332,11 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBGrayRow = ARGBGrayRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
@@ -2122,6 +2380,11 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
#endif
+#if defined(HAS_ARGBSEPIAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBSepiaRow = ARGBSepiaRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBSEPIAROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_MSA;
@@ -2173,6 +2436,11 @@ int ARGBColorMatrix(const uint8_t* src_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
@@ -2372,6 +2640,12 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb,
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
}
#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+ }
+#endif
+
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
for (y = 0; y < height; ++y) {
ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
@@ -2430,6 +2704,11 @@ int ARGBBlur(const uint8_t* src_argb,
CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+ }
+#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
@@ -2531,6 +2810,11 @@ int ARGBShade(const uint8_t* src_argb,
ARGBShadeRow = ARGBShadeRow_NEON;
}
#endif
+#if defined(HAS_ARGBSHADEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBShadeRow = ARGBShadeRow_MMI;
+ }
+#endif
#if defined(HAS_ARGBSHADEROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_MSA;
@@ -2599,6 +2883,14 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -2722,6 +3014,14 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
+#if defined(HAS_ARGBSHUFFLEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBShuffleRow = ARGBShuffleRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBSHUFFLEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
@@ -2739,6 +3039,80 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
+// Gaussian blur a float plane using a 5x5 filter with coefficients
+// 1, 4, 6, 4, 1. Each destination pixel is a blur of the 5x5 pixels
+// around it in the source. Source edges are clamped.
+// The edge is 2 pixels on each side, and the interior is a multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int y;
+ void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+ const float* src3, const float* src4, float* dst,
+ int width) = GaussCol_F32_C;
+ void (*GaussRow_F32)(const float* src, float* dst, int width) =
+ GaussRow_F32_C;
+ if (!src || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussCol_F32 = GaussCol_F32_NEON;
+ }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussRow_F32 = GaussRow_F32_NEON;
+ }
+#endif
+ {
+ // 2 pixels on each side, but aligned out to 16 bytes.
+ align_buffer_64(rowbuf, (4 + width + 4) * 4);
+ memset(rowbuf, 0, 16);
+ memset(rowbuf + (4 + width) * 4, 0, 16);
+ float* row = (float*)(rowbuf + 16);
+ const float* src0 = src;
+ const float* src1 = src;
+ const float* src2 = src;
+ const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+ const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+ for (y = 0; y < height; ++y) {
+ GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+ // Extrude edge by 2 floats
+ row[-2] = row[-1] = row[0];
+ row[width + 1] = row[width] = row[width - 1];
+
+ GaussRow_F32(row - 2, dst, width);
+
+ src0 = src1;
+ src1 = src2;
+ src2 = src3;
+ src3 = src4;
+ if ((y + 2) < (height - 1)) {
+ src4 += src_stride;
+ }
+ dst += dst_stride;
+ }
+ free_aligned_buffer_64(rowbuf);
+ }
+ return 0;
+}
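+// The 5x5 Gaussian above is separable: each pass applies
+//   1*a + 4*b + 6*c + 4*d + 1*e
+// once down columns (GaussCol_F32) and once along rows (GaussRow_F32).
+// The weights sum to 16 per pass, 256 for the full 5x5 kernel; the
+// normalization is assumed to live in the row/column kernels, which are
+// not shown in this patch.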
+
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
@@ -2793,6 +3167,14 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -2812,6 +3194,11 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelYRow = SobelYRow_NEON;
}
#endif
+#if defined(HAS_SOBELYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelYRow = SobelYRow_MMI;
+ }
+#endif
#if defined(HAS_SOBELYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelYRow = SobelYRow_MSA;
@@ -2827,6 +3214,11 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelXRow = SobelXRow_NEON;
}
#endif
+#if defined(HAS_SOBELXROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelXRow = SobelXRow_MMI;
+ }
+#endif
#if defined(HAS_SOBELXROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelXRow = SobelXRow_MSA;
@@ -2906,6 +3298,14 @@ int ARGBSobel(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SOBELROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelRow = SobelRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SOBELROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelRow = SobelRow_Any_MSA;
@@ -2944,6 +3344,14 @@ int ARGBSobelToPlane(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SOBELTOPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelToPlaneRow = SobelToPlaneRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SOBELTOPLANEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
@@ -2983,6 +3391,14 @@ int ARGBSobelXY(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SOBELXYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelXYRow = SobelXYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SOBELXYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelXYRow = SobelXYRow_Any_MSA;
@@ -3228,6 +3644,14 @@ int ARGBCopyAlpha(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBCOPYALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBCopyAlphaRow(src_argb, dst_argb, width);
@@ -3280,6 +3704,12 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_NEON;
}
#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
+ : ARGBExtractAlphaRow_Any_MMI;
+ }
+#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
@@ -3337,6 +3767,14 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBCopyYToAlphaRow(src_y, dst_argb, width);
@@ -3398,6 +3836,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
@@ -3430,6 +3876,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -3514,6 +3968,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
@@ -3546,6 +4008,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -3581,6 +4051,56 @@ int UYVYToNV12(const uint8_t* src_uyvy,
return 0;
}
+// width and height are the source plane size; odd sizes are handled.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
+ for (y = 0; y < height - 1; y += 2) {
+ // Merge a row of U and V into a row of UV.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
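+// Each output UV pair is assumed to average a 2x2 block of the full-size
+// U and V planes (hence the src strides passed into the row function and
+// the y += 2 loop). A final odd row is handled by passing a stride of 0,
+// averaging the row with itself.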
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/rotate.cc b/chromium/third_party/libyuv/source/rotate.cc
index f2bed85b755..32904e47312 100644
--- a/chromium/third_party/libyuv/source/rotate.cc
+++ b/chromium/third_party/libyuv/source/rotate.cc
@@ -36,6 +36,15 @@ void TransposePlane(const uint8_t* src,
void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx8_C;
#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#else
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
@@ -49,6 +58,11 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
+#if defined(HAS_TRANSPOSEWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ TransposeWx8 = TransposeWx8_MMI;
+ }
+#endif
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
@@ -57,14 +71,7 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeWx16 = TransposeWx16_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- TransposeWx16 = TransposeWx16_MSA;
- }
- }
-#endif
+#endif /* defined(HAS_TRANSPOSEWX16_MSA) */
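+// Note: on MSA the plane is transposed in 16-wide strips (TransposeWx16);
+// every other path uses 8-wide strips, so the #if/#else above keeps the
+// two dispatch sets mutually exclusive.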
#if defined(HAS_TRANSPOSEWX16_MSA)
// Work across the source in 16x16 tiles
@@ -137,7 +144,7 @@ void RotatePlane180(const uint8_t* src,
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
@@ -158,6 +165,14 @@ void RotatePlane180(const uint8_t* src,
}
}
#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
@@ -186,14 +201,19 @@ void RotatePlane180(const uint8_t* src,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
+ }
+#endif
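+  // The loop below swaps rows through a scratch buffer: copy row y, mirror
+  // row (height - 1 - y) into row y, then mirror the saved copy into row
+  // (height - 1 - y). Buffering the first row appears intended to keep the
+  // rotation correct when src and dst overlap.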
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
- MirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
+ CopyRow(src, row, width); // Copy first row into buffer
MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ MirrorRow(row, dst_bot, width); // Mirror buffer into last row
+ src += src_stride;
dst += dst_stride;
- CopyRow(row, dst_bot, width); // Copy first mirrored row into last
src_bot -= src_stride;
dst_bot -= dst_stride;
}
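Each pass of the rewritten loop saves the top source row, mirrors the bottom source row into the top destination row, and mirrors the saved copy into the bottom destination row; the net effect is unchanged, but the scratch buffer now holds unmirrored pixels and both row reversals go through MirrorRow. A hedged sketch of the same swap pattern on plain byte rows, with mirror_row() as a hypothetical helper that writes its input reversed:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static void mirror_row(const uint8_t* src, uint8_t* dst, int width) {
      for (int x = 0; x < width; ++x) dst[x] = src[width - 1 - x];
    }

    // Rotate one plane 180 degrees using a single temporary row, swapping
    // and reversing rows in top/bottom pairs as the rewritten loop does.
    static void Rotate180Sketch(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride,
                                int width, int height) {
      const uint8_t* src_bot = src + (size_t)src_stride * (height - 1);
      uint8_t* dst_bot = dst + (size_t)dst_stride * (height - 1);
      uint8_t* row = (uint8_t*)malloc(width);
      // Odd height mirrors the middle row twice, harmlessly.
      for (int y = 0; y < (height + 1) / 2; ++y) {
        memcpy(row, src, width);          // copy top row into buffer
        mirror_row(src_bot, dst, width);  // mirror bottom row into top
        mirror_row(row, dst_bot, width);  // mirror buffer into bottom
        src += src_stride;
        dst += dst_stride;
        src_bot -= src_stride;
        dst_bot -= dst_stride;
      }
      free(row);
    }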
@@ -219,6 +239,15 @@ void TransposeUV(const uint8_t* src,
int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#else
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -232,14 +261,15 @@ void TransposeUV(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeUVWx16 = TransposeUVWx16_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx16 = TransposeUVWx16_MSA;
+#if defined(HAS_TRANSPOSEUVWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_MMI;
}
}
#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
#if defined(HAS_TRANSPOSEUVWX16_MSA)
// Work through the source in 8x8 tiles.
@@ -314,21 +344,26 @@ void RotateUV180(const uint8_t* src,
int width,
int height) {
int i;
- void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
- int width) = MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_NEON;
+ void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorSplitUVRow_C;
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_NEON;
}
#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- MirrorUVRow = MirrorUVRow_SSSE3;
+ MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_MMI;
}
#endif
-#if defined(HAS_MIRRORUVROW_MSA)
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
- MirrorUVRow = MirrorUVRow_MSA;
+ MirrorSplitUVRow = MirrorSplitUVRow_MSA;
}
#endif
@@ -336,7 +371,7 @@ void RotateUV180(const uint8_t* src,
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
- MirrorUVRow(src, dst_a, dst_b, width);
+ MirrorSplitUVRow(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
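The rename from MirrorUVRow to MirrorSplitUVRow describes the kernel more precisely: it walks one interleaved UV row from right to left and de-interleaves it into separate U and V outputs, which is exactly what this 180-degree rotate-and-split path needs. A sketch of the scalar fallback's likely shape (hedged; not quoted from the patch):

    #include <stdint.h>

    void MirrorSplitUVRow_C(const uint8_t* src_uv,
                            uint8_t* dst_u, uint8_t* dst_v, int width) {
      src_uv += (width - 1) * 2;  // start at the last UV pair
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[0];  // U of the mirrored position
        dst_v[x] = src_uv[1];  // V of the mirrored position
        src_uv -= 2;
      }
    }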
@@ -451,6 +486,66 @@ int I420Rotate(const uint8_t* src_y,
}
LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum libyuv::RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case libyuv::kRotate0:
+ // copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
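I444Rotate mirrors the existing I420Rotate entry point, but because I444 chroma is not subsampled, all three planes share the frame's full width and height, and for the 90/270-degree modes the destination strides follow the swapped dimensions. A usage sketch, assuming the declaration lands in libyuv/rotate.h next to I420Rotate (the tightly packed buffer layout is illustrative):

    #include <cstdint>
    #include <vector>
    #include "libyuv/rotate.h"

    // Rotate a tightly packed width x height I444 frame by 90 degrees.
    // The destination planes are height x width, so their stride is the
    // source height.
    bool RotateI444By90(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        int width, int height, std::vector<uint8_t>* out) {
      const size_t plane = (size_t)width * height;
      out->resize(3 * plane);
      uint8_t* dy = out->data();
      uint8_t* du = dy + plane;
      uint8_t* dv = du + plane;
      return libyuv::I444Rotate(y, width, u, width, v, width,
                                dy, height, du, height, dv, height,
                                width, height, libyuv::kRotate90) == 0;
    }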
+LIBYUV_API
int NV12ToI420Rotate(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
diff --git a/chromium/third_party/libyuv/source/rotate_any.cc b/chromium/third_party/libyuv/source/rotate_any.cc
index c2752e6222c..b3baf084d0c 100644
--- a/chromium/third_party/libyuv/source/rotate_any.cc
+++ b/chromium/third_party/libyuv/source/rotate_any.cc
@@ -35,6 +35,9 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
+#ifdef HAS_TRANSPOSEWX8_MMI
+TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
+#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
@@ -62,6 +65,9 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
+#ifdef HAS_TRANSPOSEUVWX8_MMI
+TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
+#endif
#ifdef HAS_TRANSPOSEUVWX16_MSA
TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
#endif
diff --git a/chromium/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libyuv/source/rotate_argb.cc
index 5a6e05376f1..ae653886018 100644
--- a/chromium/third_party/libyuv/source/rotate_argb.cc
+++ b/chromium/third_party/libyuv/source/rotate_argb.cc
@@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check that the stride is a multiple of 4 (one ARGB pixel is 4 bytes).
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -48,6 +52,14 @@ static void ARGBTranspose(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
+ if (IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
@@ -62,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@@ -113,7 +126,7 @@ void ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -134,6 +147,14 @@ void ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMirrorRow = ARGBMirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBMIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
@@ -174,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
LIBYUV_API
@@ -201,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
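With ARGBTranspose and the three rotate helpers now returning int, the new stride check propagates out of ARGBRotate: a 90- or 270-degree rotation of a surface whose stride is not a multiple of 4 bytes now fails with -1 rather than stepping through misaligned pixels (src_pixel_step is the stride divided by 4). Callers should check the result even for modes that previously could not fail; a hedged sketch:

    #include "libyuv/rotate_argb.h"

    // Returns true on success. For the transpose-based modes (90/270) the
    // source stride must now be a multiple of 4, i.e. whole ARGB pixels.
    bool RotateArgbChecked(const uint8_t* src, int src_stride,
                           uint8_t* dst, int dst_stride,
                           int width, int height, libyuv::RotationMode mode) {
      return libyuv::ARGBRotate(src, src_stride, dst, dst_stride,
                                width, height, mode) == 0;
    }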
diff --git a/chromium/third_party/libyuv/source/rotate_gcc.cc b/chromium/third_party/libyuv/source/rotate_gcc.cc
index 04e19e29eef..fd359d4ae69 100644
--- a/chromium/third_party/libyuv/source/rotate_gcc.cc
+++ b/chromium/third_party/libyuv/source/rotate_gcc.cc
@@ -31,75 +31,75 @@ void TransposeWx8_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -121,127 +121,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -266,95 +266,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
// Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
diff --git a/chromium/third_party/libyuv/source/rotate_mmi.cc b/chromium/third_party/libyuv/source/rotate_mmi.cc
new file mode 100644
index 00000000000..f8de60834d9
--- /dev/null
+++ b/chromium/third_party/libyuv/source/rotate_mmi.cc
@@ -0,0 +1,291 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+void TransposeWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
+ uint8_t* src_tmp = nullptr;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "ldc1 %[tmp12], 0x00(%[src]) \n\t"
+ "dadd %[src_tmp], %[src], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (00 10 01 11 02 12 03 13) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (04 14 05 15 06 16 07 17) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (20 30 21 31 22 32 23 33) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (24 34 25 35 26 36 27 37) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp4 = (00 10 20 30 01 11 21 31) */
+ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
+ /* tmp5 = (02 12 22 32 03 13 23 33) */
+ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
+ /* tmp6 = (04 14 24 34 05 15 25 35) */
+ "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
+ /* tmp7 = (06 16 26 36 07 17 27 37) */
+ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (40 50 41 51 42 52 43 53) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (44 54 45 55 46 56 47 57) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (60 70 61 71 62 72 63 73) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (64 74 65 75 66 76 67 77) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp8 = (40 50 60 70 41 51 61 71) */
+ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
+ /* tmp9 = (42 52 62 72 43 53 63 73) */
+ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
+ /* tmp10 = (44 54 64 74 45 55 65 75) */
+ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
+ /* tmp11 = (46 56 66 76 47 57 67 77) */
+ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
+
+ /* tmp0 = (00 10 20 30 40 50 60 70) */
+ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
+ /* tmp1 = (01 11 21 31 41 51 61 71) */
+ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (02 12 22 32 42 52 62 72) */
+ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
+ /* tmp1 = (03 13 23 33 43 53 63 73) */
+ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (04 14 24 34 44 54 64 74) */
+ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
+ /* tmp1 = (05 15 25 35 45 55 65 75) */
+ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (06 16 26 36 46 56 66 76) */
+ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
+ /* tmp1 = (07 17 27 37 47 57 67 77) */
+ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "daddi %[src], %[src], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
+ [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
+ [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
+ [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
+ [src_tmp] "+&r"(src_tmp)
+ : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride),
+ [dst_stride] "r"(dst_stride)
+ : "memory");
+}
+
+void TransposeUVWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
+ uint8_t* src_tmp = nullptr;
+
+ __asm__ volatile(
+ "1: \n\t"
+ /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
+ "ldc1 %[tmp12], 0x00(%[src]) \n\t"
+ "dadd %[src_tmp], %[src], %[src_stride] \n\t"
+ /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
+ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
+ /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
+ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
+ /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
+ "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
+ /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
+ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
+ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
+ /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
+ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
+ /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
+ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
+ /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
+ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
+
+ /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
+ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
+ /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
+ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
+ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
+ /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
+ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
+ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
+ /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
+ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
+ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
+ /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
+ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "daddiu %[src], %[src], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
+ [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
+ [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
+ [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
+ [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp)
+ : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a),
+ [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride)
+ : "memory");
+}
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
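The new rotate_mmi.cc implements the same 8x8 tile transpose as the SSSE3 and NEON paths, using Loongson MMI's 64-bit punpck interleaves: three rounds of byte, halfword, and word unpacks turn eight 8-byte rows into eight transposed columns, with the intermediate lane layouts spelled out in the inline comments. A scalar reference for what one TransposeWx8 iteration produces (a sketch, not the upstream C fallback verbatim):

    #include <stdint.h>

    // For each of `width` source columns, write an 8-byte destination row
    // taken from the 8 source rows: dst[x][y] = src[y][x].
    static void TransposeWx8_Sketch(const uint8_t* src, int src_stride,
                                    uint8_t* dst, int dst_stride, int width) {
      for (int x = 0; x < width; ++x) {
        for (int y = 0; y < 8; ++y) {
          dst[x * dst_stride + y] = src[y * src_stride + x];
        }
      }
    }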
diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc
index fdc0dd476c6..844df2bf305 100644
--- a/chromium/third_party/libyuv/source/rotate_neon.cc
+++ b/chromium/third_party/libyuv/source/rotate_neon.cc
@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld1.8 {d0}, [%0], %2 \n"
- "vld1.8 {d1}, [%0], %2 \n"
- "vld1.8 {d2}, [%0], %2 \n"
- "vld1.8 {d3}, [%0], %2 \n"
- "vld1.8 {d4}, [%0], %2 \n"
- "vld1.8 {d5}, [%0], %2 \n"
- "vld1.8 {d6}, [%0], %2 \n"
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d1}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d3}, [%0], %4 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d5}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d7}, [%0], %4 \n"
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld2.8 {d0, d1}, [%0], %2 \n"
- "vld2.8 {d2, d3}, [%0], %2 \n"
- "vld2.8 {d4, d5}, [%0], %2 \n"
- "vld2.8 {d6, d7}, [%0], %2 \n"
- "vld2.8 {d16, d17}, [%0], %2 \n"
- "vld2.8 {d18, d19}, [%0], %2 \n"
- "vld2.8 {d20, d21}, [%0], %2 \n"
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d6}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d18}, [%0], %4 \n"
- "vst1.8 {d16}, [%0], %4 \n"
- "vst1.8 {d22}, [%0], %4 \n"
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- "vst1.8 {d3}, [%0], %6 \n"
- "vst1.8 {d1}, [%0], %6 \n"
- "vst1.8 {d7}, [%0], %6 \n"
- "vst1.8 {d5}, [%0], %6 \n"
- "vst1.8 {d19}, [%0], %6 \n"
- "vst1.8 {d17}, [%0], %6 \n"
- "vst1.8 {d23}, [%0], %6 \n"
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
diff --git a/chromium/third_party/libyuv/source/rotate_neon64.cc b/chromium/third_party/libyuv/source/rotate_neon64.cc
index f469baacf68..43c1581731d 100644
--- a/chromium/third_party/libyuv/source/rotate_neon64.cc
+++ b/chromium/third_party/libyuv/source/rotate_neon64.cc
@@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
- "sub %w3, %w3, #8 \n"
+ "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
- "ld1 {v0.8b}, [%0], %5 \n"
- "ld1 {v1.8b}, [%0], %5 \n"
- "ld1 {v2.8b}, [%0], %5 \n"
- "ld1 {v3.8b}, [%0], %5 \n"
- "ld1 {v4.8b}, [%0], %5 \n"
- "ld1 {v5.8b}, [%0], %5 \n"
- "ld1 {v6.8b}, [%0], %5 \n"
- "ld1 {v7.8b}, [%0] \n"
-
- "trn2 v16.8b, v0.8b, v1.8b \n"
- "trn1 v17.8b, v0.8b, v1.8b \n"
- "trn2 v18.8b, v2.8b, v3.8b \n"
- "trn1 v19.8b, v2.8b, v3.8b \n"
- "trn2 v20.8b, v4.8b, v5.8b \n"
- "trn1 v21.8b, v4.8b, v5.8b \n"
- "trn2 v22.8b, v6.8b, v7.8b \n"
- "trn1 v23.8b, v6.8b, v7.8b \n"
-
- "trn2 v3.4h, v17.4h, v19.4h \n"
- "trn1 v1.4h, v17.4h, v19.4h \n"
- "trn2 v2.4h, v16.4h, v18.4h \n"
- "trn1 v0.4h, v16.4h, v18.4h \n"
- "trn2 v7.4h, v21.4h, v23.4h \n"
- "trn1 v5.4h, v21.4h, v23.4h \n"
- "trn2 v6.4h, v20.4h, v22.4h \n"
- "trn1 v4.4h, v20.4h, v22.4h \n"
-
- "trn2 v21.2s, v1.2s, v5.2s \n"
- "trn1 v17.2s, v1.2s, v5.2s \n"
- "trn2 v20.2s, v0.2s, v4.2s \n"
- "trn1 v16.2s, v0.2s, v4.2s \n"
- "trn2 v23.2s, v3.2s, v7.2s \n"
- "trn1 v19.2s, v3.2s, v7.2s \n"
- "trn2 v22.2s, v2.2s, v6.2s \n"
- "trn1 v18.2s, v2.2s, v6.2s \n"
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
- "st1 {v17.8b}, [%0], %6 \n"
- "st1 {v16.8b}, [%0], %6 \n"
- "st1 {v19.8b}, [%0], %6 \n"
- "st1 {v18.8b}, [%0], %6 \n"
- "st1 {v21.8b}, [%0], %6 \n"
- "st1 {v20.8b}, [%0], %6 \n"
- "st1 {v23.8b}, [%0], %6 \n"
- "st1 {v22.8b}, [%0] \n"
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w3, %w3, #8 \n"
- "b.eq 4f \n"
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w3, #2 \n"
- "b.lt 3f \n"
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %w3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
// 4x8 block
- "mov %0, %1 \n"
- "ld1 {v0.s}[0], [%0], %5 \n"
- "ld1 {v0.s}[1], [%0], %5 \n"
- "ld1 {v0.s}[2], [%0], %5 \n"
- "ld1 {v0.s}[3], [%0], %5 \n"
- "ld1 {v1.s}[0], [%0], %5 \n"
- "ld1 {v1.s}[1], [%0], %5 \n"
- "ld1 {v1.s}[2], [%0], %5 \n"
- "ld1 {v1.s}[3], [%0] \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
- "sub %w4, %w4, #8 \n"
+ "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "ld1 {v0.16b}, [%0], %5 \n"
- "ld1 {v1.16b}, [%0], %5 \n"
- "ld1 {v2.16b}, [%0], %5 \n"
- "ld1 {v3.16b}, [%0], %5 \n"
- "ld1 {v4.16b}, [%0], %5 \n"
- "ld1 {v5.16b}, [%0], %5 \n"
- "ld1 {v6.16b}, [%0], %5 \n"
- "ld1 {v7.16b}, [%0] \n"
-
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
-
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
-
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "mov %0, %1 \n"
- "mov %0, %2 \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
- "st1 {v16.d}[0], [%0], %6 \n"
- "st1 {v18.d}[0], [%0], %6 \n"
- "st1 {v17.d}[0], [%0], %6 \n"
- "st1 {v19.d}[0], [%0], %6 \n"
- "st1 {v16.d}[1], [%0], %6 \n"
- "st1 {v18.d}[1], [%0], %6 \n"
- "st1 {v17.d}[1], [%0], %6 \n"
- "st1 {v19.d}[1], [%0] \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %3 \n"
+ "mov %0, %2 \n"
- "st1 {v20.d}[0], [%0], %7 \n"
- "st1 {v22.d}[0], [%0], %7 \n"
- "st1 {v21.d}[0], [%0], %7 \n"
- "st1 {v23.d}[0], [%0], %7 \n"
- "st1 {v20.d}[1], [%0], %7 \n"
- "st1 {v22.d}[1], [%0], %7 \n"
- "st1 {v21.d}[1], [%0], %7 \n"
- "st1 {v23.d}[1], [%0] \n"
-
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
- "subs %w4, %w4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w4, %w4, #8 \n"
- "b.eq 4f \n"
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w4, #2 \n"
- "b.lt 3f \n"
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %w4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
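The aarch64 TransposeWx8 change is not only reformatting: it interleaves a prfm pldl1keep for each source row between the trn1/trn2 shuffles, so the prefetches co-issue with the transpose math instead of bunching up ahead of the loads (448 bytes is seven 64-byte cache lines, matching the "prefetch 7 lines ahead" comment). A rough C analogue using the GCC/Clang builtin — an approximation, since the real benefit comes from hand-placing the prefetch between specific instructions:

    #include <stdint.h>

    // Issue one L1 read prefetch per row of the next tile, ~448 bytes ahead.
    static void PrefetchTileAhead(const uint8_t* src, int src_stride) {
      for (int y = 0; y < 8; ++y) {
        __builtin_prefetch(src + (size_t)y * src_stride + 448, /*rw=*/0,
                           /*locality=*/3);
      }
    }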
diff --git a/chromium/third_party/libyuv/source/row_any.cc b/chromium/third_party/libyuv/source/row_any.cc
index e91560c44c6..7216373bcd1 100644
--- a/chromium/third_party/libyuv/source/row_any.cc
+++ b/chromium/third_party/libyuv/source/row_any.cc
@@ -64,6 +64,9 @@ ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#ifdef HAS_I422ALPHATOARGBROW_MSA
ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
#endif
+#ifdef HAS_I422ALPHATOARGBROW_MMI
+ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
+#endif
#undef ANY41C
// Any 3 planes to 1.
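Each block in this file registers an "Any" wrapper for a SIMD kernel: the trailing macro argument is a width mask, so a kernel registered with mask 7 handles widths in multiples of 8 and the wrapper covers the ragged tail. In rough shape — a sketch of the pattern, not the macros' exact expansion, with SomeRow_SIMD as a hypothetical kernel — a wrapper runs the kernel on the aligned prefix and then once more on a padded stack copy of the remainder:

    #include <stdint.h>
    #include <string.h>

    void SomeRow_SIMD(const uint8_t* src, uint8_t* dst, int width);  // hypothetical

    void SomeRow_Any(const uint8_t* src, uint8_t* dst, int width) {
      enum { kMask = 7 };        // kernel handles multiples of 8 pixels
      int n = width & ~kMask;    // aligned prefix
      if (n > 0) {
        SomeRow_SIMD(src, dst, n);
      }
      int r = width & kMask;
      if (r > 0) {               // ragged tail via a padded temp
        uint8_t tmp_src[kMask + 1] = {0};
        uint8_t tmp_dst[kMask + 1];
        memcpy(tmp_src, src + n, r);
        SomeRow_SIMD(tmp_src, tmp_dst, kMask + 1);
        memcpy(dst + n, tmp_dst, r);
      }
    }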
@@ -92,6 +95,9 @@ ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
#ifdef HAS_MERGERGBROW_NEON
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#endif
+#ifdef HAS_MERGERGBROW_MMI
+ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
+#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -106,18 +112,27 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#ifdef HAS_I422TOYUY2ROW_MSA
ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
#endif
+#ifdef HAS_I422TOYUY2ROW_MMI
+ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
+#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOUYVYROW_MSA
ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
#endif
+#ifdef HAS_I422TOUYVYROW_MMI
+ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
+#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
+#ifdef HAS_BLENDPLANEROW_MMI
+ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
+#endif
#undef ANY31
// Note that odd width replication includes 444 due to implementation
@@ -203,6 +218,15 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#endif
+#ifdef HAS_I422TOARGBROW_MMI
+ANY31C(I444ToARGBRow_Any_MMI, I444ToARGBRow_MMI, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MMI, I422ToARGBRow_MMI, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MMI, I422ToRGB24Row_MMI, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MMI, I422ToARGB4444Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
+#endif
#undef ANY31C
// Any 3 planes of 16 bit to 1 with yuvconstants
@@ -238,6 +262,9 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
#ifdef HAS_I210TOAR30ROW_AVX2
ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
+#ifdef HAS_I210TOARGBROW_MMI
+ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7)
+#endif
#undef ANY31CT
// Any 2 planes to 1.
@@ -271,7 +298,15 @@ ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
#ifdef HAS_MERGEUVROW_MSA
ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
#endif
-
+#ifdef HAS_MERGEUVROW_MMI
+ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
// Math functions.
#ifdef HAS_ARGBMULTIPLYROW_SSE2
ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
@@ -303,12 +338,21 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#ifdef HAS_ARGBMULTIPLYROW_MSA
ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#endif
+#ifdef HAS_ARGBMULTIPLYROW_MMI
+ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
+#ifdef HAS_ARGBADDROW_MMI
+ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
+#ifdef HAS_ARGBSUBTRACTROW_MMI
+ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
@@ -318,6 +362,9 @@ ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
#ifdef HAS_SOBELROW_MSA
ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
#endif
+#ifdef HAS_SOBELROW_MMI
+ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
+#endif
#ifdef HAS_SOBELTOPLANEROW_SSE2
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
#endif
@@ -327,6 +374,9 @@ ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
#ifdef HAS_SOBELTOPLANEROW_MSA
ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
#endif
+#ifdef HAS_SOBELTOPLANEROW_MMI
+ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
+#endif
#ifdef HAS_SOBELXYROW_SSE2
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
#endif
@@ -336,6 +386,9 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#ifdef HAS_SOBELXYROW_MSA
ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
#endif
+#ifdef HAS_SOBELXYROW_MMI
+ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
+#endif
#undef ANY21
// Any 2 planes to 1 with yuvconstants
@@ -369,6 +422,9 @@ ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#ifdef HAS_NV12TOARGBROW_MSA
ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV12TOARGBROW_MMI
+ANY21C(NV12ToARGBRow_Any_MMI, NV12ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
@@ -381,6 +437,9 @@ ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#ifdef HAS_NV21TOARGBROW_MSA
ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV21TOARGBROW_MMI
+ANY21C(NV21ToARGBRow_Any_MMI, NV21ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
#ifdef HAS_NV12TORGB24ROW_NEON
ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
#endif
@@ -390,6 +449,9 @@ ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
#ifdef HAS_NV12TORGB24ROW_SSSE3
ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
+#ifdef HAS_NV12TORGB24ROW_MMI
+ANY21C(NV12ToRGB24Row_Any_MMI, NV12ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
#ifdef HAS_NV21TORGB24ROW_SSSE3
ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
@@ -399,6 +461,9 @@ ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
#ifdef HAS_NV21TORGB24ROW_AVX2
ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
#endif
+#ifdef HAS_NV21TORGB24ROW_MMI
+ANY21C(NV21ToRGB24Row_Any_MMI, NV21ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
@@ -411,6 +476,9 @@ ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#ifdef HAS_NV12TORGB565ROW_MSA
ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
#endif
+#ifdef HAS_NV12TORGB565ROW_MMI
+ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7)
+#endif
#undef ANY21C
// Any 1 to 1.
@@ -478,12 +546,6 @@ ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
#if defined(HAS_J400TOARGBROW_AVX2)
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
@@ -491,6 +553,9 @@ ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
+#endif
#if defined(HAS_RAWTORGB24ROW_SSSE3)
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
#endif
@@ -510,7 +575,6 @@ ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
#endif
#if defined(HAS_ARGBTORGB24ROW_MSA)
ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
@@ -519,7 +583,14 @@ ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
-ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
+ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
@@ -527,12 +598,21 @@ ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#if defined(HAS_RAWTORGB24ROW_MSA)
ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
+#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
+#ifdef HAS_ABGRTOYROW_AVX2
+ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
+#endif
#ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif
+#ifdef HAS_RGBATOYJROW_AVX2
+ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
+#endif
#ifdef HAS_UYVYTOYROW_AVX2
ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
#endif
@@ -552,63 +632,117 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
#ifdef HAS_ARGBTOYJROW_SSSE3
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#endif
+#ifdef HAS_RGBATOYJROW_SSSE3
+ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
#ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYROW_MSA
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBTOYROW_MMI
+ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
+#ifdef HAS_RGBATOYJROW_NEON
+ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7)
+#endif
#ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBTOYJROW_MMI
+ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_MSA
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_BGRATOYROW_MMI
+ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_MSA
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
#endif
+#ifdef HAS_ABGRTOYROW_MMI
+ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_MSA
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_RGBATOYROW_MMI
+ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7)
+#endif
#ifdef HAS_RGB24TOYROW_MSA
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
#endif
+#ifdef HAS_RGB24TOYROW_MMI
+ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
+#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7)
+#endif
#ifdef HAS_RAWTOYROW_MSA
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
#endif
+#ifdef HAS_RAWTOYROW_MMI
+ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
+#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_RGB565TOYROW_MSA
ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
#endif
+#ifdef HAS_RGB565TOYROW_MMI
+ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_ARGB1555TOYROW_NEON
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB1555TOYROW_MSA
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
#endif
+#ifdef HAS_ARGB1555TOYROW_MMI
+ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_ARGB4444TOYROW_NEON
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
#endif
+#ifdef HAS_ARGB4444TOYROW_MMI
+ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_YUY2TOYROW_NEON
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#endif
@@ -618,39 +752,75 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
#ifdef HAS_YUY2TOYROW_MSA
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_YUY2TOYROW_MMI
+ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
+#endif
#ifdef HAS_UYVYTOYROW_MSA
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_UYVYTOYROW_MMI
+ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
+#ifdef HAS_SWAPUVROW_NEON
+ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
+#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RGB24TOARGBROW_MSA
ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
#endif
+#ifdef HAS_RGB24TOARGBROW_MMI
+ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
+#endif
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
+#ifdef HAS_RAWTORGBAROW_NEON
+ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
+#endif
#ifdef HAS_RAWTOARGBROW_MSA
ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
#endif
+#ifdef HAS_RAWTOARGBROW_MMI
+ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
+#endif
#ifdef HAS_RGB565TOARGBROW_NEON
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_MSA
ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_RGB565TOARGBROW_MMI
+ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_MSA
ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_ARGB1555TOARGBROW_MMI
+ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_MSA
ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_ARGB4444TOARGBROW_MMI
+ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
@@ -669,6 +839,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_MSA
ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
#endif
+#ifdef HAS_ARGBATTENUATEROW_MMI
+ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
+#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
@@ -681,6 +854,9 @@ ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
+ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
+#endif
#undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
@@ -705,12 +881,18 @@ ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
+#ifdef HAS_ARGBCOPYALPHAROW_MMI
+ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
+#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
+ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
+#endif
#undef ANY11B
// Any 1 to 1 with parameter.
@@ -728,6 +910,47 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+ I400ToARGBRow_SSE2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+ I400ToARGBRow_AVX2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+ I400ToARGBRow_NEON,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+ I400ToARGBRow_MSA,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ANY11P(I400ToARGBRow_Any_MMI,
+ I400ToARGBRow_MMI,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
ARGBToRGB565DitherRow_SSE2,
@@ -760,6 +983,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA,
2,
7)
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ANY11P(ARGBToRGB565DitherRow_Any_MMI,
+ ARGBToRGB565DitherRow_MMI,
+ const uint32_t,
+ 4,
+ 2,
+ 3)
+#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
#endif
@@ -772,6 +1003,10 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
#ifdef HAS_ARGBSHUFFLEROW_MSA
ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
#endif
+#ifdef HAS_ARGBSHUFFLEROW_MMI
+ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
+#endif
+#undef ANY11P
#undef ANY11P
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
@@ -909,6 +1144,10 @@ ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
#endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+ANY11C(YUY2ToARGBRow_Any_MMI, YUY2ToARGBRow_MMI, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7)
+#endif
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
@@ -940,6 +1179,9 @@ ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#ifdef HAS_INTERPOLATEROW_MSA
ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
#endif
+#ifdef HAS_INTERPOLATEROW_MMI
+ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
+#endif
#undef ANY11T
// Any 1 to 1 mirror.
@@ -964,11 +1206,26 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
#endif
#ifdef HAS_MIRRORROW_MSA
ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#endif
+#ifdef HAS_MIRRORROW_MMI
+ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
+#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -976,17 +1233,27 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
#endif
+#ifdef HAS_ARGBMIRRORROW_MMI
+ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
+#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
+#endif
#undef ANY11M
// Any 1 plane. (memset)
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
SIMD_ALIGNED(uint8_t temp[64]); \
+ memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
@@ -1008,6 +1275,9 @@ ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
#ifdef HAS_ARGBSETROW_MSA
ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
#endif
+#ifdef HAS_ARGBSETROW_MMI
+ANY1(ARGBSetRow_Any_MMI, ARGBSetRow_MMI, uint32_t, 4, 3)
+#endif
#undef ANY1
// Any 1 to 2. Outputs UV planes.
@@ -1039,6 +1309,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#ifdef HAS_SPLITUVROW_MSA
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
#endif
+#ifdef HAS_SPLITUVROW_MMI
+ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
+#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
@@ -1060,6 +1333,11 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_YUY2TOUV422ROW_MMI
+ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
+ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
+ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
+#endif
#undef ANY12
// Any 1 to 3. Outputs RGB planes.
@@ -1086,6 +1364,9 @@ ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif
+#ifdef HAS_SPLITRGBROW_MMI
+ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
+#endif
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
@@ -1116,6 +1397,9 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif
+#ifdef HAS_ABGRTOUVROW_AVX2
+ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
+#endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
@@ -1140,29 +1424,44 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
#ifdef HAS_ARGBTOUVROW_MSA
ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_ARGBTOUVROW_MMI
+ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_ARGBTOUVJROW_MMI
+ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MSA
-ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MMI
+ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
-ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MMI
+ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MSA
-ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MMI
+ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
@@ -1170,27 +1469,42 @@ ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#ifdef HAS_RGB24TOUVROW_MSA
ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
#endif
+#ifdef HAS_RGB24TOUVROW_MMI
+ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
+#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_MSA
ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
#endif
+#ifdef HAS_RAWTOUVROW_MMI
+ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
+#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_RGB565TOUVROW_MSA
ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
#endif
+#ifdef HAS_RGB565TOUVROW_MMI
+ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_MSA
ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
#endif
+#ifdef HAS_ARGB1555TOUVROW_MMI
+ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
+#ifdef HAS_ARGB4444TOUVROW_MMI
+ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_YUY2TOUVROW_NEON
ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#endif
@@ -1200,11 +1514,48 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#ifdef HAS_YUY2TOUVROW_MSA
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#endif
+#ifdef HAS_YUY2TOUVROW_MMI
+ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
+#endif
#ifdef HAS_UYVYTOUVROW_MSA
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
#endif
+#ifdef HAS_UYVYTOUVROW_MMI
+ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
+#endif
#undef ANY12S
+// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, MASK + 1); \
+ memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \
+ }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
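The `(width & 1) && UVSHIFT == 0` branch in ANY11S duplicates the last pixel in both scratch rows so a kernel that averages horizontal pairs (UVSHIFT == 0, as in the AYUV routines registered here) never reads past the copied tail. A tiny standalone illustration of that duplication step; the function name and 4-byte pixel size are illustrative.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* After copying 'valid' pixels of 'bpp' bytes into 'row', append a copy of
 * the last pixel so pair-averaging over valid+1 pixels is well defined. */
static void RepeatLastPixel(uint8_t* row, int valid, int bpp) {
  memcpy(row + valid * bpp, row + (valid - 1) * bpp, (size_t)bpp);
}

int main(void) {
  uint8_t row[16] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
  RepeatLastPixel(row, 3, 4);                /* 3 valid AYUV-sized pixels */
  assert(memcmp(row + 12, row + 8, 4) == 0); /* pixel 3 mirrors pixel 2 */
  return 0;
}
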
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc
index 2bbc5adbf14..79aed5c7877 100644
--- a/chromium/third_party/libyuv/source/row_common.cc
+++ b/chromium/third_party/libyuv/source/row_common.cc
@@ -14,30 +14,44 @@
#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h" // For kYuvI601Constants
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// The following ifdef from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+#define LIBYUV_RGB7 1
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86)
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#endif
+
// llvm x86 is poor at ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
static __inline int32_t clamp0(int32_t v) {
- return ((-(v) >> 31) & (v));
+ return -(v >= 0) & v;
}
-
+// TODO(fbarchard): make clamp255 preserve negative values.
static __inline int32_t clamp255(int32_t v) {
- return (((255 - (v)) >> 31) | (v)) & 255;
+ return (-(v >= 255) | v) & 255;
}
static __inline int32_t clamp1023(int32_t v) {
- return (((1023 - (v)) >> 31) | (v)) & 1023;
+ return (-(v >= 1023) | v) & 1023;
}
static __inline uint32_t Abs(int32_t v) {
- int m = v >> 31;
+ int m = -(v < 0);
return (v + m) ^ m;
}
#else // USE_BRANCHLESS
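
The rewritten helpers replace the old shift-derived masks with comparison-derived ones: `-(cond)` is all-ones when the condition holds and zero otherwise, so it selects a value without a branch. A standalone check of the identities, including the caveat flagged in the TODO above:

#include <assert.h>
#include <stdint.h>

static int32_t clamp0(int32_t v)   { return -(v >= 0) & v; }
static int32_t clamp255(int32_t v) { return (-(v >= 255) | v) & 255; }
static uint32_t Abs(int32_t v)     { int m = -(v < 0); return (v + m) ^ m; }

int main(void) {
  assert(clamp0(-7) == 0 && clamp0(7) == 7);
  assert(clamp255(300) == 255 && clamp255(200) == 200);
  assert(clamp255(-1) == 255); /* the TODO: negatives are not preserved */
  assert(Abs(-5) == 5 && Abs(5) == 5);
  return 0;
}
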
@@ -111,6 +125,21 @@ void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
}
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t r = src_raw[0];
+ uint8_t g = src_raw[1];
+ uint8_t b = src_raw[2];
+ dst_rgba[0] = 255u;
+ dst_rgba[1] = b;
+ dst_rgba[2] = g;
+ dst_rgba[3] = r;
+ dst_rgba += 4;
+ src_raw += 3;
+ }
+}
+
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -181,7 +210,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -195,7 +225,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -209,7 +240,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = ar30 & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
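
All three AR30 loops above swap `*(const uint32_t*)src_ar30` for a `memcpy` into a local. The cast form is undefined behavior when the pointer is not 4-byte aligned (and shaky under strict aliasing); a fixed-size memcpy is defined for any alignment, and current compilers fold it to the same single load. A sketch of the idiom:

#include <stdint.h>
#include <string.h>

/* Well-defined unaligned 32-bit load; optimizers emit a single mov. */
static uint32_t LoadU32(const uint8_t* p) {
  uint32_t v;
  memcpy(&v, p, sizeof v);
  return v;
}

int main(void) {
  const uint8_t ar30[4] = {0xff, 0x03, 0x00, 0x00}; /* 0x000003ff, little-endian */
  return (LoadU32(ar30) & 0x3ff) == 1023 ? 0 : 1;   /* exits 0 on success */
}
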
@@ -381,18 +413,55 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
}
}
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+ return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
+}
+#else
+// 8 bit
+// Intel SSE/AVX uses the following equivalent formula
+// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round.
+// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
+// 0x7e80) >> 8;
+
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
+#endif
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#ifdef LIBYUV_RGBTOU_TRUNCATE
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * b - 74 * g - 38 * r + 0x8000) >> 8;
+}
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * r - 94 * g - 18 * b + 0x8000) >> 8;
+}
+#else
+// TODO(fbarchard): Add rounding to SIMD and use this
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
+#endif
+
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8;
+}
+#endif
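+
With the 8-bit coefficients, the studio-swing anchors fall out exactly: black maps to 16, white to 235, and a neutral gray leaves the truncating U formula unbiased at 128. A standalone check of the constants above:

#include <assert.h>
#include <stdint.h>

/* The 8-bit fixed-point luma and truncating chroma from this hunk. */
static int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
static int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  return (112 * b - 74 * g - 38 * r + 0x8000) >> 8;
}

int main(void) {
  assert(RGBToY(0, 0, 0) == 16);        /* video black */
  assert(RGBToY(255, 255, 255) == 235); /* video white */
  assert(RGBToU(128, 128, 128) == 128); /* neutral gray -> unbiased chroma */
  return 0;
}
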
// ARGBToY_C and ARGBToUV_C
+// Intel version mimics SSE/AVX, which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
+
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@@ -407,15 +476,12 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
- src_rgb1[B + BPP]) >> \
- 2; \
- uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
- src_rgb1[G + BPP]) >> \
- 2; \
- uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
- src_rgb1[R + BPP]) >> \
- 2; \
+ uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
@@ -424,13 +490,54 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
dst_v += 1; \
} \
if (width & 1) { \
- uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
+#else
+// ARM version does sum / 2, then multiplies by 2x smaller coefficients
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = src_rgb0[B] + src_rgb1[B]; \
+ uint16_t ag = src_rgb0[G] + src_rgb1[G]; \
+ uint16_t ar = src_rgb0[R] + src_rgb1[R]; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ } \
+ }
+#endif
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
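
The box filter now averages the two rows first, then the two columns, each step with AVGB, matching the rounding of the x86 pavgb instruction. That cascade is not always the exact rounded mean of the four samples; a tiny check exhibiting a one-off case:

#include <assert.h>

#define AVGB(a, b) (((a) + (b) + 1) >> 1) /* rounds up, like pavgb */

int main(void) {
  int p00 = 0, p01 = 0; /* top row: left, right */
  int p10 = 0, p11 = 1; /* bottom row: left, right */
  int cascade = AVGB(AVGB(p00, p10), AVGB(p01, p11)); /* rows, then columns */
  int exact = (p00 + p01 + p10 + p11 + 2) >> 2;       /* round-to-nearest */
  assert(cascade == 1 && exact == 0); /* the cascade rounds up one extra */
  return 0;
}
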
@@ -448,14 +555,14 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// b 0.1016 * 255 = 25.908 = 25
// g 0.5078 * 255 = 129.489 = 129
// r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
+// JPeg 7 bit Y (deprecated)
// b 0.11400 * 128 = 14.592 = 15
// g 0.58700 * 128 = 75.136 = 75
// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit Y:
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
// JPeg 8 bit U:
// b 0.50000 * 255 = 127.5 = 127
// g -0.33126 * 255 = -84.4713 = -84
@@ -465,20 +572,37 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (38 * r + 75 * g + 15 * b + 64) >> 7;
}
+#else
+// 8 bit
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+ return (77 * r + 150 * g + 29 * b + 128) >> 8;
+}
+#endif
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
}
static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#else
+static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+}
+#endif
// ARGBToYJ_C and ARGBToUVJ_C
+// Intel version mimics SSE/AVX, which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@@ -514,8 +638,53 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
dst_v[0] = RGBToVJ(ar, ag, ab); \
} \
}
+#else
+// ARM version does sum / 2, then multiplies by 2x smaller coefficients
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
+ uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
+ uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ } \
+ }
+
+#endif
MAKEROWYJ(ARGB, 2, 1, 0, 4)
+MAKEROWYJ(RGBA, 3, 2, 1, 4)
+MAKEROWYJ(RGB24, 2, 1, 0, 3)
+MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
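
The JPEG-range luma coefficients sum to 256 (77 + 150 + 29), so RGBToYJ is an identity on gray levels: full-range black stays 0 and white stays 255, in contrast to the studio-range anchors checked earlier. A quick standalone verification:

#include <assert.h>
#include <stdint.h>

/* 8-bit JPEG luma from this hunk; 77 + 150 + 29 == 256 keeps grays fixed. */
static int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (77 * r + 150 * g + 29 * b + 128) >> 8;
}

int main(void) {
  assert(RGBToYJ(0, 0, 0) == 0);
  assert(RGBToYJ(255, 255, 255) == 255);
  assert(RGBToYJ(128, 128, 128) == 128);
  return 0;
}
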
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -583,13 +752,34 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b3 = next_rgb565[2] & 0x1f;
uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
uint8_t r3 = next_rgb565[3] >> 3;
- uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 787 -> 888.
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 2) | (g1 >> 4);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 2) | (g3 >> 4);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
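
Rather than summing packed 5/6-bit channels and widening afterwards, the loop now expands every channel to 8 bits up front by replicating its high bits into the low ones, which maps each channel's maximum to exactly 255 where a plain shift would not. A standalone check of the expansion:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint8_t b5 = 31, g6 = 63, m5 = 16;
  assert(((b5 << 3) | (b5 >> 2)) == 255); /* 5-bit max -> 255 exactly */
  assert(((g6 << 2) | (g6 >> 4)) == 255); /* 6-bit max -> 255 exactly */
  assert(((m5 << 3) | (m5 >> 2)) == 132); /* mid-range 5-bit example */
  return 0;
}
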
@@ -602,14 +792,27 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b2 = next_rgb565[0] & 0x1f;
uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b = (b0 + b2); // 565 * 2 = 676.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 676 -> 888
- g = (g << 1) | (g >> 6);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -633,14 +836,34 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t b3 = next_argb1555[2] & 0x1f;
uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 777 -> 888.
- g = (g << 1) | (g >> 6);
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 3) | (g1 >> 2);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 3) | (g3 >> 2);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
@@ -653,14 +876,27 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t b2 = next_argb1555[0] & 0x1f;
uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
uint8_t r2 = next_argb1555[1] >> 3;
- uint8_t b = (b0 + b2); // 555 * 2 = 666.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -684,14 +920,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b3 = next_argb4444[2] & 0x0f;
uint8_t g3 = next_argb4444[2] >> 4;
uint8_t r3 = next_argb4444[3] & 0x0f;
- uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b1 = (b1 << 4) | b1;
+ g1 = (g1 << 4) | g1;
+ r1 = (r1 << 4) | r1;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+ b3 = (b3 << 4) | b3;
+ g3 = (g3 << 4) | g3;
+ r3 = (r3 << 4) | r3;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
@@ -704,14 +960,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b2 = next_argb4444[0] & 0x0f;
uint8_t g2 = next_argb4444[0] >> 4;
uint8_t r2 = next_argb4444[1] & 0x0f;
- uint8_t b = (b0 + b2); // 444 * 2 = 555.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 3) | (b >> 2); // 555 -> 888.
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -1087,26 +1356,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__) // 32 bit arm
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1118,7 +1387,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1129,7 +1400,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1168,26 +1441,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1199,7 +1472,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1210,7 +1485,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1251,26 +1528,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1282,7 +1559,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1293,7 +1572,95 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.2020 YUV to RGB reference
+// R = (Y - 16) * 1.164384 - V * -1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 - U * -2.14177
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+
+// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.142 * 64)) */
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR -107 /* round(-1.67867 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
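These fixed-point values follow from the reference coefficients above at 6-bit precision (scale 64), with YG carrying the extra 256 * 256 / 257 factor consumed by the `0x0101 * YG` trick. A standalone recomputation; note that UB saturates at the -128 limit of the signed 8-bit SIMD lanes:

#include <assert.h>
#include <math.h>

int main(void) {
  assert(lround(1.164384 * 64 * 256 * 256 / 257) == 19003); /* YG */
  assert(lround(0.187326 * 64) == 12);                      /* UG */
  assert(lround(0.65042 * 64) == 42);                       /* VG */
  assert(lround(-1.67867 * 64) == -107);                    /* VR */
  assert(lround(-2.14177 * 64) == -137);                    /* UB pre-clamp */
  /* UB is then clamped: max(-128, -137) == -128 to fit int8 lanes. */
  return 0;
}
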
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1308,7 +1675,6 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 16 bit.
-
static __inline void YuvPixel(uint8_t y,
uint8_t u,
uint8_t v,
@@ -1324,7 +1690,7 @@ static __inline void YuvPixel(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1333,7 +1699,7 @@ static __inline void YuvPixel(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1367,7 +1733,7 @@ static __inline void YuvPixel8_16(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1376,7 +1742,7 @@ static __inline void YuvPixel8_16(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1411,7 +1777,7 @@ static __inline void YuvPixel16(int16_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1420,7 +1786,7 @@ static __inline void YuvPixel16(int16_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1458,21 +1824,26 @@ static __inline void YuvPixel10(uint16_t y,
*r = Clamp(r16 >> 6);
}
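The kYToRgb[1] reads in the hunks above line up with the new ARM constant layout shown earlier in this patch, where each block ends in {0x0101 * YG, YG, 0, 0}. A minimal sketch of the equivalence (illustrative, not patch text):

  // Element 0 holds the byte-replicated multiplier (0x0101 * YG) used by
  // the assembly; element 1 now carries plain YG for the C reference code.
  int yg_old = yuvconstants->kYToRgb[0] / 0x0101;  // unpack, pre-patch
  int yg_new = yuvconstants->kYToRgb[1];           // direct read, post-patch
  // yg_old == yg_new for any YG representable in the packed form.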
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
- uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32_t)(y1 + YGB) >> 6);
- *g = Clamp((int32_t)(y1 + YGB) >> 6);
- *r = Clamp((int32_t)(y1 + YGB) >> 6);
+// Reads 8 bit Y and writes 8 bit B, G and R.
+static __inline void YPixel(uint8_t y,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ *b = Clamp(((int32_t)(y1) + ygb) >> 6);
+ *g = Clamp(((int32_t)(y1) + ygb) >> 6);
+ *r = Clamp(((int32_t)(y1) + ygb) >> 6);
}
-#undef YG
-#undef YGB
-
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C code that mimics the assembly.
@@ -2006,18 +2377,21 @@ void I422ToRGBARow_C(const uint8_t* src_y,
}
}
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
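I400ToARGBRow_C now takes the same yuvconstants parameter as the other row functions, so grey expansion honors the selected colorspace. A minimal usage sketch (the helper name and buffers are hypothetical; kYuvI601Constants is the BT.601 table declared in row.h):

  extern const struct YuvConstants kYuvI601Constants;

  // Expand one row of 8-bit luma into opaque ARGB (B,G,R,A in memory).
  static void ExpandGreyRow(const uint8_t* src_y, uint8_t* dst_argb,
                            int width) {
    I400ToARGBRow_C(src_y, dst_argb, &kYuvI601Constants, width);
  }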
@@ -2035,10 +2409,21 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0];
+ dst_uv[1] = src_uv[1];
+ src_uv -= 2;
+ dst_uv += 2;
+ }
+}
+
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width - 1; x += 2) {
@@ -2069,6 +2454,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
+ int x;
+ src_rgb24 += width * 3 - 3;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_rgb24[0];
+ uint8_t g = src_rgb24[1];
+ uint8_t r = src_rgb24[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ src_rgb24 -= 3;
+ dst_rgb24 += 3;
+ }
+}
+
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2208,10 +2608,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
}
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
- uint32_t* d = (uint32_t*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
- d[x] = v32;
+ memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32);
}
}
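The ARGBSetRow_C rewrite above trades a store through a casted uint32_t pointer for memcpy. Writing through (uint32_t*)dst_argb is undefined behavior when the destination is not 4-byte aligned (and trips sanitizers), while a fixed-size memcpy is well defined and compiles to the same single 32-bit store. A standalone sketch of the idiom (hypothetical helper name):

  #include <stdint.h>
  #include <string.h>

  // Well-defined 32-bit store to a possibly unaligned address.
  static inline void Store32(uint8_t* dst, uint32_t v) {
    memcpy(dst, &v, sizeof v);  // optimizers lower this to one mov/str
  }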
@@ -2309,7 +2708,7 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
}
}
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
@@ -2385,10 +2784,14 @@ void BlendPlaneRow_C(const uint8_t* src0,
}
#undef UBLEND
+#if defined(__aarch64__) || defined(__arm__)
+#define ATTENUATE(f, a) (f * a + 128) >> 8
+#else
+// This code mimics the SSSE3 version for better testability.
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#endif
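The two ATTENUATE variants round differently: the SSSE3-mimicking form is effectively (f * a) / 255 via 16-bit replication, while the ARM form is (f * a + 128) / 256, matching the NEON kernels. A small check of the divergence (editorial sketch; values assumed 0..255):

  #include <stdint.h>

  static uint8_t AttenuateSSSE3(uint32_t f, uint32_t a) {
    return (uint8_t)(((a | (a << 8)) * (f | (f << 8))) >> 24);  // ~f*a/255
  }
  static uint8_t AttenuateNEON(uint32_t f, uint32_t a) {
    return (uint8_t)((f * a + 128) >> 8);                       // ~f*a/256
  }
  // For f == a == 255 the first form yields 255, the second 254; the
  // per-platform #if keeps the C reference bit-exact with each assembly.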
// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
@@ -3175,12 +3578,73 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
}
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_SSSE3
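All four helpers above share one pattern: expand up to MAXTWIDTH packed-RGB pixels into an aligned ARGB scratch row with an existing SSSE3 kernel, then run an existing ARGB-to-YJ kernel on it, avoiding dedicated RGB24/RAW kernels. A generalized sketch of the loop (the wrapper and typedefs are hypothetical):

  typedef void (*ToARGBRowFn)(const uint8_t*, uint8_t*, int);
  typedef void (*ToYJRowFn)(const uint8_t*, uint8_t*, int);

  static void PackedToYJViaARGB(const uint8_t* src, uint8_t* dst_yj,
                                int width, int src_bpp,
                                ToARGBRowFn to_argb, ToYJRowFn to_yj) {
    SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);  // ARGB scratch row
    while (width > 0) {
      int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
      to_argb(src, row, twidth);   // e.g. RAWToARGBRow_SSSE3
      to_yj(row, dst_yj, twidth);  // e.g. ARGBToYJRow_AVX2
      src += twidth * src_bpp;     // 3 bytes per pixel for RGB24/RAW
      dst_yj += twidth;
      width -= twidth;
    }
  }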
+
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
-#if defined(__clang__)
-#pragma clang loop vectorize_width(4)
-#endif
for (i = 0; i < width; ++i) {
float v = *src++;
fsum += v * v;
@@ -3231,6 +3695,154 @@ void GaussCol_C(const uint16_t* src0,
}
}
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+ }
+}
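GaussCol_F32_C leaves its column sums unnormalized (the [1 4 6 4 1] kernel weighs 16), and GaussRow_F32_C folds the full 2D normalization into its 1/256 = 1/(16*16) scale. A quick DC check (editorial sketch):

  // A constant input survives the separable pass unchanged.
  static void CheckGaussDC(void) {
    float r[5] = {2.f, 2.f, 2.f, 2.f, 2.f};
    float col, out;
    GaussCol_F32_C(r, r, r, r, r, &col, 1);  // col == 2 * 16 == 32
    float tmp[5] = {col, col, col, col, col};
    GaussRow_F32_C(tmp, &out, 1);            // out == 32 * 16 / 256 == 2
  }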
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_yuv24[0] = src_vu[0]; // V
+ dst_yuv24[1] = src_vu[1]; // U
+ dst_yuv24[2] = src_y[0]; // Y0
+ dst_yuv24[3] = src_vu[0]; // V
+ dst_yuv24[4] = src_vu[1]; // U
+ dst_yuv24[5] = src_y[1]; // Y1
+ src_y += 2;
+ src_vu += 2;
+ dst_yuv24 += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ dst_yuv24[0] = src_vu[0]; // V
+ dst_yuv24[1] = src_vu[1]; // U
+ dst_yuv24[2] = src_y[0]; // Y0
+ }
+}
+
+// Filter 2 rows of AYUV UV's (444) into UV (420).
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ // Output a row of UV values, filtering 2x2 rows of AYUV.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 5] + 2) >>
+ 2;
+ dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 4] + 2) >>
+ 2;
+ src_ayuv += 8;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 0] + 2) >>
+ 2;
+ dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 1] + 2) >>
+ 2;
+ }
+}
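The "+ 2 >> 2" above is a rounded average of the four chroma samples in each 2x2 block; the odd-width tail duplicates the final column so the last output still averages two distinct rows. An equivalent scalar helper (hypothetical):

  // Round-to-nearest average of a 2x2 neighborhood: without the +2,
  // (10 + 11 + 11 + 11) >> 2 truncates to 10 instead of rounding to 11.
  static inline uint8_t Avg2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
    return (uint8_t)((a + b + c + d + 2) >> 2);
  }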
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ // Output a row of VU values, filtering 2x2 rows of AYUV.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 4] + 2) >>
+ 2;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 5] + 2) >>
+ 2;
+ src_ayuv += 8;
+ dst_vu += 2;
+ }
+ if (width & 1) {
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 0] + 2) >>
+ 2;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 1] + 2) >>
+ 2;
+ }
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ // Output a row of Y values.
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = src_ayuv[2]; // v,u,y,a
+ src_ayuv += 4;
+ }
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t u = src_uv[0];
+ uint8_t v = src_uv[1];
+ dst_vu[0] = v;
+ dst_vu[1] = u;
+ src_uv += 2;
+ dst_vu += 2;
+ }
+}
+
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >>
+ 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >>
+ 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+ dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc
index 8d3cb81cec2..a107c30e769 100644
--- a/chromium/third_party/libyuv/source/row_gcc.cc
+++ b/chromium/third_party/libyuv/source/row_gcc.cc
@@ -22,12 +22,15 @@ extern "C" {
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
-static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+ 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
// JPeg full range.
-static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+ 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+ 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@@ -45,8 +48,8 @@ static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0};
// Constants for BGRA
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+ 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
0, -38, -74, 112, 0, -38, -74, 112};
@@ -55,8 +58,8 @@ static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+ 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
-38, -74, 112, 0, -38, -74, 112, 0};
@@ -65,8 +68,8 @@ static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA.
-static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+ 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
0, 112, -74, -38, 0, 112, -74, -38};
@@ -74,17 +77,15 @@ static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
0, -18, -94, 112, 0, -18, -94, 112};
-static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
-
-// 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+ 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
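A worked reading of the new Y constants above (editorial, with example values): the coefficients double from 7-bit precision (13/65/33, shift 7) to 8-bit (25/129/66, shift 8), pixels get 128 subtracted via kSub128 so pmaddubsw can pair unsigned coefficients with signed bytes, and kAddY16 becomes the word bias 0x7e80, which folds the -128 terms, the +16 offset, and the +0.5 rounding into one add:

  // y = (25*(B-128) + 129*(G-128) + 66*(R-128) + 0x7e80) >> 8
  //   = (25*B + 129*G + 66*R + (32384 - 220*128)) >> 8
  //   = (25*B + 129*G + 66*R + 4224) >> 8       // 4224 == 16*256 + 128
  // black (0,0,0)       -> 4224 >> 8           == 16
  // white (255,255,255) -> (56100 + 4224) >> 8 == 235  (BT.601 studio range)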
#ifdef HAS_RGB24TOARGBROW_SSSE3
@@ -97,6 +98,10 @@ static const uvec8 kShuffleMaskRGB24ToARGB = {
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+// Shuffle table for converting RAW to RGBA.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
+ 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
@@ -154,24 +159,24 @@ static const lvec8 kShuffleNV21 = {
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -185,35 +190,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -223,35 +228,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -259,29 +264,68 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
+// Same code as RAWToARGBRow_SSSE3 but with a different shuffler and A in the low bits.
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGBA) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
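A scalar model of RAWToRGBARow_SSSE3 (editorial sketch; in libyuv's naming, RGBA stores alpha in the lowest byte and RAW is R,G,B in memory):

  static void RAWToRGBARow_Model(const uint8_t* src_raw, uint8_t* dst_rgba,
                                 int width) {
    int x;
    for (x = 0; x < width; ++x) {
      dst_rgba[0] = 255;         // A, forced opaque (the psrld $0x18 mask)
      dst_rgba[1] = src_raw[2];  // B
      dst_rgba[2] = src_raw[1];  // G
      dst_rgba[3] = src_raw[0];  // R
      src_raw += 3;
      dst_rgba += 4;
    }
  }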
+
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
uint8_t* dst_rgb24,
int width) {
asm volatile(
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x4(%0),%%xmm1 \n"
- "movdqu 0x8(%0),%%xmm2 \n"
- "lea 0x18(%0),%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -293,44 +337,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -341,47 +385,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -392,34 +436,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,0x00(%1,%0,2) \n"
- "movdqu %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -430,35 +474,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -469,35 +513,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -512,37 +556,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -571,26 +615,26 @@ static const ulvec8 kPermARGBToRGB24_2 = {
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vmovdqa %3,%%ymm5 \n"
- "vmovdqa %4,%%ymm6 \n"
- "vmovdqa %5,%%ymm7 \n"
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
- "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
- "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -606,37 +650,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -650,34 +694,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -690,40 +734,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
const uint32_t dither4,
int width) {
asm volatile(
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -739,35 +783,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -780,38 +824,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -821,26 +865,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -884,31 +928,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028;
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -923,31 +967,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -964,25 +1008,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1001,25 +1045,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1034,82 +1078,130 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif
+// clang-format off
+
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// The round parameter names the register holding the value to add before the shift.
+#define RGBTOY(round) \
+ "1: \n" \
+ "movdqu (%0),%%xmm0 \n" \
+ "movdqu 0x10(%0),%%xmm1 \n" \
+ "movdqu 0x20(%0),%%xmm2 \n" \
+ "movdqu 0x30(%0),%%xmm3 \n" \
+ "psubb %%xmm5,%%xmm0 \n" \
+ "psubb %%xmm5,%%xmm1 \n" \
+ "psubb %%xmm5,%%xmm2 \n" \
+ "psubb %%xmm5,%%xmm3 \n" \
+ "movdqu %%xmm4,%%xmm6 \n" \
+ "pmaddubsw %%xmm0,%%xmm6 \n" \
+ "movdqu %%xmm4,%%xmm0 \n" \
+ "pmaddubsw %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm4,%%xmm1 \n" \
+ "pmaddubsw %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm4,%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "lea 0x40(%0),%0 \n" \
+ "phaddw %%xmm0,%%xmm6 \n" \
+ "phaddw %%xmm2,%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
+ "psrlw $0x8,%%xmm6 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm6 \n" \
+ "movdqu %%xmm6,(%1) \n" \
+ "lea 0x10(%1),%1 \n" \
+ "sub $0x10,%2 \n" \
+ "jg 1b \n"
+
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
+ "vzeroupper \n"
+
+// clang-format on
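The RGBTOY/RGBTOY_AVX2 macros factor out the per-function loops that the hunks below delete, parameterized on the register holding the rounding bias: xmm7/ymm7 carries kAddY16 (0x7e80) for limited-range Y, while the YJ paths reuse kSub128 (0x8080) as their bias. A C model of the math (editorial sketch, not patch text):

  #include <stdint.h>

  // coeff_* are the unsigned byte weights; bias is the 16-bit round word.
  static uint8_t RGBToYModel(uint8_t b, uint8_t g, uint8_t r, int coeff_b,
                             int coeff_g, int coeff_r, int bias) {
    int y = coeff_b * (b - 128) + coeff_g * (g - 128) + coeff_r * (r - 128);
    return (uint8_t)((y + bias) >> 8);
  }
  // RGBToYModel(b, g, r, 25, 129, 66, 0x7e80) -> 16..235 (Y,  BT.601)
  // RGBToYModel(b, g, r, 29, 150, 77, 0x8080) -> 0..255  (YJ, full range)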
+
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBTOYJROW_SSSE3
+#ifdef HAS_RGBATOYJROW_SSSE3
+// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16.
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_SSSE3
+
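A minimal sketch of how a caller might use the new row kernel. The 640 width and the RGBAToYJRow_C fallback name are assumptions; the SSSE3 loop above consumes 16 pixels per iteration, and libyuv normally routes leftover widths through its *_Any wrappers.

#include <stdint.h>
#include "libyuv/row.h"

/* Sketch: convert one 640-pixel row of RGBA to full-range (JPEG) luma. */
void ConvertRowExample(const uint8_t* src_rgba, uint8_t* dst_y) {
#if defined(HAS_RGBATOYJROW_SSSE3)
  RGBAToYJRow_SSSE3(src_rgba, dst_y, 640); /* 640 is a multiple of 16 */
#else
  RGBAToYJRow_C(src_rgba, dst_y, 640);     /* assumed portable fallback */
#endif
}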
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd mask to undo the lane interleave caused by vphaddw + vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
@@ -1119,83 +1211,84 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
- "m"(kAddY16), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_AVX2
+#ifdef HAS_ABGRTOYROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n"
+
+ LABELALIGN RGBTOY_AVX2(ymm7)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYROW_AVX2
+
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
- "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64), // %4
+ "m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
+#ifdef HAS_RGBATOYJROW_AVX2
+// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+
+      LABELALIGN RGBTOY_AVX2(ymm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_AVX2
+
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
@@ -1203,52 +1296,52 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1275,44 +1368,44 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1328,6 +1421,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVROW_AVX2
+#ifdef HAS_ABGRTOUVROW_AVX2
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kABGRToV), // %6
+ "m"(kABGRToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVROW_AVX2
+
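For orientation, a scalar sketch of one U output from these subsampling UV kernels. The function name and block layout are illustrative; libyuv "ARGB" is B,G,R,A in memory (for ABGR swap r and b), and the coefficient tables live elsewhere in this file.

#include <stdint.h>

/* Sketch: one U byte from a 2x2 pixel block, mirroring pavgb + pmaddubsw +
 * psraw + packsswb + paddb kAddUV128. p10/p11 come from the next row via
 * src_stride. */
static inline uint8_t BlockToU(const uint8_t* p00, const uint8_t* p01,
                               const uint8_t* p10, const uint8_t* p11,
                               const int8_t* to_u /* e.g. kARGBToU */) {
  /* pavgb averages with rounding, (a + b + 1) >> 1: first vertically
   * (row 0 vs row 1), then horizontally (left vs right pixel). */
  int b = (((p00[0] + p10[0] + 1) >> 1) + ((p01[0] + p11[0] + 1) >> 1) + 1) >> 1;
  int g = (((p00[1] + p10[1] + 1) >> 1) + ((p01[1] + p11[1] + 1) >> 1) + 1) >> 1;
  int r = (((p00[2] + p10[2] + 1) >> 1) + ((p01[2] + p11[2] + 1) >> 1) + 1) >> 1;
  int u = (to_u[0] * b + to_u[1] * g + to_u[2] * r) >> 8; /* psraw $0x8 */
  return (uint8_t)(u + 128); /* packsswb, then paddb kAddUV128 */
}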
#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
@@ -1338,52 +1494,52 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUVJ128), // %5
+ "m"(kSub128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
@@ -1399,53 +1555,53 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1453,7 +1609,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6
- "m"(kAddUVJ128) // %7
+ "m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_SSSE3
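Why kSub128 can replace the old kAddUVJ128 in the UVJ paths: read as 16-bit lanes, the {128, 128, ...} byte pattern is 0x8080, whose low byte supplies the 0.5 ulp for round-to-nearest at the >>8 and whose high bit flips the word's sign so the signed-saturating pack lands on the 128-biased chroma byte. A worked scalar model (name is illustrative):

#include <stdint.h>

/* Sketch of "paddw kSub128 ; psraw $0x8 ; packsswb" on one chroma word.
 * For t = the pmaddubsw dot product, the stored byte reinterpreted as
 * unsigned equals ((t + 128) >> 8) + 128. */
static inline uint8_t ChromaWordToByte(int16_t t) {
  int16_t u = (int16_t)(t + 0x8080); /* paddw: wraps mod 2^16 */
  int8_t s = (int8_t)(u >> 8);       /* psraw $0x8 ; packsswb */
  return (uint8_t)s;                 /* e.g. t = 0 -> 128, t = 256 -> 129 */
}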
@@ -1464,47 +1620,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1518,36 +1674,19 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@@ -1556,52 +1695,52 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1615,70 +1754,36 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
@@ -1687,52 +1792,52 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1750,52 +1855,52 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -2012,16 +2117,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2041,27 +2146,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- "sub %[u_buf],%[v_buf] \n"
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0,(%[dst_rgb24]) \n"
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2087,16 +2192,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2116,21 +2221,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2151,16 +2256,16 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2181,21 +2286,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2218,15 +2323,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422
YUVTORGB(yuvconstants)
STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2253,15 +2358,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2281,15 +2386,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2309,15 +2414,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2337,15 +2442,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2366,16 +2471,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STORERGBA
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2590,17 +2695,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2624,18 +2729,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2659,23 +2764,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2699,18 +2804,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2734,23 +2839,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2776,16 +2881,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2815,11 +2920,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
@@ -2859,16 +2964,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2892,16 +2997,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2925,16 +3030,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2958,16 +3063,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2982,17 +3087,15 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
#endif // HAS_UYVYTOARGBROW_AVX2
#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
- asm volatile(
- "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
+      "movdqa      224(%3),%%xmm3                \n"  // ygb = -1160 = 32 - 1.164 * 16 * 64
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
@@ -3001,8 +3104,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
@@ -3018,28 +3121,26 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"sub $0x8,%2 \n"
"jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_SSE2
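The rewritten I400 path now pulls yg/ygb from the shared YuvConstants block (the 192/224 offsets above). A scalar model of the per-byte math, under the assumption that yg = 18997 and ygb = -1160 as the comments state:

#include <stdint.h>

/* Sketch of the new Y expansion: y' = clamp(1.164 * (y - 16)). */
static inline uint8_t ExpandY(uint8_t y, uint16_t yg, int16_t ygb) {
  uint16_t y16 = (uint16_t)(y * 0x0101);  /* punpcklbw: y duplicated */
  int32_t v = ((uint32_t)y16 * yg) >> 16; /* pmulhuw: unsigned high mul */
  v = v + ygb;                            /* paddsw: bias plus rounding */
  v >>= 6;                                /* psraw $0x6 */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packuswb */
}
/* ExpandY(16, 18997, -1160) == 0 and ExpandY(235, 18997, -1160) == 255. */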
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
- "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "vmovd %%eax,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
- "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
- "vmovd %%eax,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
+ "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
+      "vmovdqa     224(%3),%%ymm3                \n"  // ygb = -1160 = 32 - 1.164 * 16 * 64
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
@@ -3049,8 +3150,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
@@ -3060,15 +3161,15 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
+ "lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_AVX2
@@ -3081,16 +3182,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %3,%%xmm5 \n"
+ "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu -0x10(%0,%2,1),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3108,13 +3209,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3125,37 +3226,136 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
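What the new MirrorUVRow kernels compute, as a scalar sketch (the _C_sketch name is illustrative, not part of the patch): pair order is reversed while each pair keeps its internal U-then-V byte order, which is exactly what kShuffleMirrorUV encodes.

#include <stdint.h>

/* Scalar equivalent of MirrorUVRow; width counts UV pairs. */
void MirrorUVRow_C_sketch(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_uv[2 * (width - 1 - i) + 0]; /* U stays first */
    dst_uv[2 * i + 1] = src_uv[2 * (width - 1 - i) + 1]; /* V stays second */
  }
}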
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %4,%%xmm1 \n"
- "lea -0x10(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
: "memory", "cc", "xmm0", "xmm1");
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle the first 5 pixels to the last 5, mirrored. First byte is zero.
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+ 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
+
+// Shuffle the last 5 pixels to the first 5, mirrored. Last byte is zero.
+static const uvec8 kShuffleMirrorRGB1 = {
+ 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Mirror 16 pixels (48 bytes) per loop, shuffling 5 pixels (15 bytes) at a time.
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ src_rgb24 += width * 3 - 48;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5
+ "movdqu %%xmm2,2(%1) \n" // next 5
+ "movlpd %%xmm3,0(%1) \n" // first 1
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorRGB0), // %3
+ "m"(kShuffleMirrorRGB1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_RGB24MIRRORROW_SSSE3
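A scalar sketch of what RGB24MirrorRow computes (the _C_sketch name is illustrative): pixel order is reversed while the 3 bytes inside each pixel stay in place, which is why the SSSE3 version needs the two 15-byte shuffle tables above rather than a plain byte reverse.

#include <stdint.h>

/* Scalar equivalent of RGB24MirrorRow; width counts pixels. */
void RGB24MirrorRow_C_sketch(const uint8_t* src_rgb24, uint8_t* dst_rgb24,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* s = src_rgb24 + 3 * (width - 1 - i);
    dst_rgb24[3 * i + 0] = s[0];
    dst_rgb24[3 * i + 1] = s[1];
    dst_rgb24[3 * i + 2] = s[2];
  }
}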
#ifdef HAS_ARGBMIRRORROW_SSE2
@@ -3163,17 +3363,17 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "lea -0x10(%0,%2,4),%0 \n"
+ "lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3189,15 +3389,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "vmovdqu %3,%%ymm5 \n"
+ "vmovdqu %3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3213,28 +3413,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -3251,28 +3451,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -3289,22 +3489,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -3322,21 +3522,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3359,30 +3559,30 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
int width) {
// clang-format off
asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
-
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1,1),%%ymm1 \n"
+ "add $0x20,%0 \n"
+
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "add $0x40,%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3405,24 +3605,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3443,23 +3643,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "add $0x20,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3475,25 +3675,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3514,25 +3714,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "add $0x10,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3548,26 +3748,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "add $0x40,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3619,41 +3819,41 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "lea 0x30(%0),%0 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -3714,42 +3914,42 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,16(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,32(%3) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x30(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -3771,35 +3971,35 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
LABELALIGN
"2: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 2b \n"
-
- LABELALIGN "9: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3814,14 +4014,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3836,7 +4036,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep movsb \n"
+ "rep movsb \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
@@ -3849,29 +4049,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3884,21 +4084,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "vmovdqu 0x20(%0),%%ymm2 \n"
- "lea 0x40(%0),%0 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3917,17 +4117,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0), %%xmm0 \n"
- "movdqu 0x10(%0), %%xmm1 \n"
- "lea 0x20(%0), %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1), %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+rm"(width) // %2
@@ -3945,28 +4145,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile(
- "vmovdqa %3,%%ymm4 \n"
+ "vmovdqa %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0), %%ymm0 \n"
- "vmovdqu 0x20(%0), %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x40(%0), %%ymm2 \n"
- "vmovdqu 0x60(%0), %%ymm3 \n"
- "lea 0x80(%0), %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@@ -3981,31 +4181,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm2 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4018,23 +4218,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vpmovzxbd (%0),%%ymm1 \n"
- "vpmovzxbd 0x8(%0),%%ymm2 \n"
- "lea 0x10(%0),%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4050,7 +4250,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4061,7 +4261,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosb \n"
+ "rep stosb \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v8) // %2
@@ -4072,7 +4272,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4083,21 +4283,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4111,32 +4311,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4150,28 +4350,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4185,16 +4385,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4208,32 +4408,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4247,28 +4447,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4281,22 +4481,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -4311,32 +4511,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4351,30 +4551,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4389,17 +4589,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -4413,32 +4613,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4453,30 +4653,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4498,71 +4698,71 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
// 1 pixel loop.
"91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4586,36 +4786,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(alpha), // %2
@@ -4638,43 +4838,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
"vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
"vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 32 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@@ -4688,7 +4888,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
+// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
@@ -4698,35 +4898,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -4747,29 +4947,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4789,32 +4989,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movzb 0x03(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x07(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
@@ -4834,52 +5034,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
int width) {
uintptr_t alpha;
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
"vbroadcastf128 %5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
// replace VPGATHER
- "movzb 0x03(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x07(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "movzb 0x13(%0),%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x17(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x1b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x1f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
"vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
"vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
"vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// end of VPGATHER
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4896,44 +5096,48 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBGRAYROW_SSSE3
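This hunk is one of the few non-whitespace changes in the section: ARGBGrayRow_SSSE3 now subtracts 128 from the pixels (psubb with kSub128) before pmaddubsw. pmaddubsw multiplies an unsigned operand by a signed one; biasing the pixels into signed range makes them the signed operand, freeing the coefficient vector to use the full unsigned 0..255 range, and the later paddw of kSub128 (0x8080 per word, i.e. 32768 + 128) cancels the -128 * 256 bias while adding 0.5 of rounding ahead of the psrlw of 8. Scalar sketch of the resulting luma, assuming full-range coefficients that sum to 256 (the exact values live in kARGBToYJ):

#include <stdint.h>

// 29/150/77 are assumed BT.601 full-range luma weights (sum = 256).
static inline uint8_t GrayY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}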
@@ -4954,50 +5158,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "m"(kARGBToSepiaB), // %2
@@ -5015,54 +5219,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm6,0x10(%1) \n"
- "lea 0x20(%0),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5080,40 +5284,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -5131,27 +5335,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5168,28 +5372,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5207,26 +5411,26 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu (%1),%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5238,7 +5442,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
- );
+ );
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
@@ -5252,15 +5456,15 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5280,14 +5484,14 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpaddusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5308,15 +5512,15 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5336,14 +5540,14 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpsubusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5365,40 +5569,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobelx,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "movq 0x02(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x00(%0,%2,1),%%xmm2 \n"
- "movq 0x02(%0,%2,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -5419,39 +5623,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobely,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x01(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x02(%0,%1,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -5472,37 +5676,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "movdqu %%xmm3,0x20(%2) \n"
- "movdqu %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5519,21 +5723,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_y,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -5554,36 +5758,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6,(%2) \n"
- "movdqu %%xmm4,0x10(%2) \n"
- "movdqu %%xmm7,0x20(%2) \n"
- "movdqu %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5602,67 +5806,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width) {
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu 0x10(%2),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu 0x20(%2),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu 0x30(%2),%%xmm5 \n"
- "lea 0x40(%2),%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "movdqu %%xmm4,0x20(%1) \n"
- "movdqu %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop.
LABELALIGN
"10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "lea 0x10(%2),%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(row), // %0
@@ -5682,119 +5886,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
uint8_t* dst,
int count) {
asm volatile(
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
-
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
// 4 pixel small loop.
LABELALIGN
"4: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "movdqu (%0),%%xmm0 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
@@ -5817,70 +6021,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile(
- "movq (%3),%%xmm2 \n"
- "movq 0x08(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x04(%2),%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
"+r"(src_argb_stride_temp), // %1
@@ -5902,68 +6106,68 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -5983,61 +6187,61 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
"vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "rep movsb \n"
- "jmp 999f \n"
+ "rep movsb \n"
+ "jmp 999f \n"
"99: \n"
"vzeroupper \n"
@@ -6059,20 +6263,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6093,16 +6297,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -6120,24 +6324,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "add $0x10,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6156,24 +6360,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "add $0x10,%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6192,27 +6396,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6231,27 +6435,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
- "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6269,47 +6473,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
int width) {
asm volatile(
- "pxor %%xmm3,%%xmm3 \n"
+ "pxor %%xmm3,%%xmm3 \n"
// 2 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps 0x10(%3),%%xmm0 \n"
- "mulps 0x10(%3),%%xmm4 \n"
- "addps (%3),%%xmm0 \n"
- "addps (%3),%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps 0x20(%3),%%xmm2 \n"
- "mulps 0x20(%3),%%xmm6 \n"
- "mulps 0x30(%3),%%xmm1 \n"
- "mulps 0x30(%3),%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6405,27 +6609,27 @@ void HalfFloatRow_AVX2(const uint16_t* src,
int width) {
scale *= kScaleBias;
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -6446,8 +6650,8 @@ void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
@@ -6481,7 +6685,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
@@ -6515,21 +6719,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "movzb -0x1(%0),%1 \n"
- "movzb 0x03(%3,%1,4),%1 \n"
- "mov %b1,-0x1(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6548,18 +6752,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6578,86 +6782,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
uintptr_t pixel_temp;
uintptr_t table_temp;
asm volatile(
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%2),%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb (%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,(%3) \n"
- "movzb 0x1(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x1(%3) \n"
- "movzb 0x2(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x2(%3) \n"
- "movzb 0x3(%2),%0 \n"
- "mov %b0,0x3(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x4(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x4(%3) \n"
- "movzb 0x5(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x5(%3) \n"
- "movzb 0x6(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x6(%3) \n"
- "movzb 0x7(%2),%0 \n"
- "mov %b0,0x7(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x8(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x8(%3) \n"
- "movzb 0x9(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x9(%3) \n"
- "movzb 0xa(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xa(%3) \n"
- "movzb 0xb(%2),%0 \n"
- "mov %b0,0xb(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
-
- "movzb 0xc(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xc(%3) \n"
- "movzb 0xd(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xd(%3) \n"
- "movzb 0xe(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xe(%3) \n"
- "movzb 0xf(%2),%0 \n"
- "mov %b0,0xf(%3) \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
: "=&d"(pixel_temp), // %0
"=&a"(table_temp), // %1
"+r"(src_argb), // %2
@@ -6669,6 +6873,300 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+
+// begin NV21ToYUV24Row_C avx2 constants
+static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
+
+static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
+
+static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
+
+static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
+ 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
+
+static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
+ 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
+
+static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
+ 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
+
+static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
+ 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
+
+static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
+ 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
+
+static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
+ 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
+
+// NV21ToYUV24Row_AVX2
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ uint8_t* src_y_ptr;
+ uint64_t src_offset = 0;
+ uint64_t width64;
+
+ width64 = width;
+ src_y_ptr = (uint8_t*)src_y;
+
+ asm volatile(
+ "vmovdqu %5, %%ymm0 \n" // init blend value
+ "vmovdqu %6, %%ymm1 \n" // init blend value
+ "vmovdqu %7, %%ymm2 \n" // init blend value
+ // "sub $0x20, %3 \n" //sub 32 from
+ // width for final loop
+
+ LABELALIGN
+ "1: \n" // label 1
+ "vmovdqu (%0,%4), %%ymm3 \n" // src_y
+ "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
+ "vmovdqu (%1), %%ymm5 \n" // src_uv
+ "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
+ "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
+ // shuf
+ "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
+ // shuf
+ "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
+ "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
+ // shuf
+ "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
+ "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
+ "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
+ "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
+ "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
+ "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
+ "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
+ "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
+ "add $0x20, %4 \n" // add to src buffer
+ // ptr
+ "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
+ "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
+ "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
+ "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
+ "add $0x60,%2 \n" // add to dst buffer
+ // ptr
+ // "cmp %3, %4 \n" //(width64 -
+ // 32 bytes) and src_offset
+ "sub $0x20,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n" // sse-avx2
+ // transistions
+
+ : "+r"(src_y), //%0
+ "+r"(src_vu), //%1
+ "+r"(dst_yuv24), //%2
+ "+r"(width64), //%3
+ "+r"(src_offset) //%4
+ : "m"(kBLEND0), //%5
+ "m"(kBLEND1), //%6
+ "m"(kBLEND2), //%7
+ "m"(kSHUF0), //%8
+ "m"(kSHUF1), //%9
+ "m"(kSHUF2), //%10
+ "m"(kSHUF3), //%11
+ "m"(kSHUF4), //%12
+ "m"(kSHUF5) //%13
+ : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
+ "xmm13", "xmm14", "xmm15");
+}
+#endif // HAS_NV21TOYUV24ROW_AVX2
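
The blend and shuffle tables above expand NV21 (a Y plane plus an interleaved VU plane) into a packed 3-bytes-per-pixel layout, 32 pixels per iteration. A plain-C sketch of the intended output; the byte order within each output triple is an assumption and should be checked against libyuv's NV21ToYUV24Row_C, and width is assumed even, as the AVX2 loop requires:

#include <stdint.h>

void NV21ToYUV24Row_C_sketch(const uint8_t* src_y, const uint8_t* src_vu,
                             uint8_t* dst_yuv24, int width) {
  // Each horizontal pair of pixels shares one V,U pair from the VU plane.
  for (int x = 0; x < width - 1; x += 2) {
    dst_yuv24[0] = src_vu[0];  // V (order assumed)
    dst_yuv24[1] = src_vu[1];  // U
    dst_yuv24[2] = src_y[0];   // Y0
    dst_yuv24[3] = src_vu[0];  // V
    dst_yuv24[4] = src_vu[1];  // U
    dst_yuv24[5] = src_y[1];   // Y1
    src_y += 2;
    src_vu += 2;
    dst_yuv24 += 6;
  }
}
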
+
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "movdqu %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_SSSE3
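
A plain-C equivalent of the byte-pair swap that kShuffleUVToVU encodes above (the name with the _C_sketch suffix is illustrative):

#include <stdint.h>

void SwapUVRow_C_sketch(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t u = src_uv[0];
    uint8_t v = src_uv[1];
    dst_vu[0] = v;
    dst_vu[1] = u;
    src_uv += 2;
    dst_vu += 2;
  }
}
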
+
+#ifdef HAS_SWAPUVROW_AVX2
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_AVX2
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
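+
+// Scalar sketch of the HalfMergeUVRow kernels above: a rounded 2x2 box
+// filter over the planar U and V inputs, interleaved into UV pairs. The
+// psrlw-then-pavgw sequence in the kernels computes the same
+// (a + b + c + d + 2) >> 2. Name and shape here are illustrative only.
+static void HalfMergeUVRow_Sketch(const uint8_t* src_u, int src_stride_u,
+                                  const uint8_t* src_v, int src_stride_v,
+                                  uint8_t* dst_uv, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+                 src_u[src_stride_u + 1] + 2) >> 2;  // U
+    dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+                 src_v[src_stride_v + 1] + 2) >> 2;  // V
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+}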
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+ asm volatile(
+ "pxor %%xmm1,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+ "add 4, %0 \n"
+ "movd %%xmm0, (%1) \n" // store float
+ "add 4, %1 \n"
+ "sub $0x4,%2 \n" // 1 float per loop
+ "jg 1b \n"
+ : "+r"(src_x), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
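+
+// Scalar sketch of ClampFloatToZero_SSE2: per-element max(x, 0.0f), with
+// NaN mapping to 0 as maxss does when a source operand is NaN.
+// Illustrative only.
+static void ClampFloatToZero_Sketch(const float* src_x, float* dst_y, int n) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    dst_y[i] = src_x[i] > 0.0f ? src_x[i] : 0.0f;
+  }
+}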
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/row_mmi.cc b/chromium/third_party/libyuv/source/row_mmi.cc
new file mode 100644
index 00000000000..9a8e2cb2d16
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_mmi.cc
@@ -0,0 +1,7842 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "libyuv/row.h"
+
+#include <string.h> // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// clang-format off
+
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+ const uint64_t mask = 0xff000000ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask] \n\t"
+ "or %[src1], %[src1], %[mask] \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask] \n\t"
+ "or %[src1], %[src1], %[mask] \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ uint64_t src0, src1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0xff000000ULL;
+ const uint64_t mask2 = 0xc6;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask1] \n\t"
+ "punpcklbh %[src0], %[src0], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[mask1] \n\t"
+ "punpcklbh %[src1], %[src1], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask2] \n\t"
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask1] \n\t"
+ "punpcklbh %[src0], %[src0], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[mask1] \n\t"
+ "punpcklbh %[src1], %[src1], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask2] \n\t"
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
+ : "memory");
+}
+
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+ uint64_t mask0 = 0xc6;
+ uint64_t mask1 = 0x6c;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t"
+ "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t"
+ "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t"
+
+ "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
+ "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
+ "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
+ "punpcklbh %[src1], %[src1], %[zero] \n\t"
+ "pextrh %[ftmp2], %[ftmp0], %[three] \n\t"
+ "pextrh %[ftmp3], %[ftmp1], %[one] \n\t"
+ "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "pextrh %[ftmp3], %[ftmp1], %[two] \n\t"
+ "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "pextrh %[ftmp2], %[src1], %[zero] \n\t"
+ "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t"
+ "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "packushb %[src1], %[src1], %[zero] \n\t"
+
+ "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t"
+
+ "daddiu %[src_raw], %[src_raw], 0x0c \n\t"
+ "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
+ [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
+ : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
+ [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
+ [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
+ : "memory");
+}
+
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[5];
+ uint64_t c0 = 0x001f001f001f001f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[c1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
+ : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
+ [four] "f"(0x04)
+ : "memory");
+}
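+
+// Scalar sketch of the 565 expansion above: each channel is widened to
+// 8 bits by replicating its top bits, and alpha is forced to 0xff.
+// Illustrative only; one pixel at a time.
+static void RGB565ToARGBPixel_Sketch(uint16_t rgb565, uint8_t argb[4]) {
+  uint8_t b = rgb565 & 0x1f;
+  uint8_t g = (rgb565 >> 5) & 0x3f;
+  uint8_t r = (uint8_t)(rgb565 >> 11);
+  argb[0] = (uint8_t)((b << 3) | (b >> 2));  // B: 5 -> 8 bits
+  argb[1] = (uint8_t)((g << 2) | (g >> 4));  // G: 6 -> 8 bits
+  argb[2] = (uint8_t)((r << 3) | (r >> 2));  // R: 5 -> 8 bits
+  argb[3] = 0xff;                            // A
+}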
+
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[6];
+ uint64_t c0 = 0x001f001f001f001f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t c2 = 0x0003000300030003;
+ uint64_t c3 = 0x007c007c007c007c;
+ uint64_t c4 = 0x0001000100010001;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psrlh %[a], %[src1], %[seven] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "xor %[a], %[a], %[c1] \n\t"
+ "paddb %[a], %[a], %[c4] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[a] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
+ : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
+ [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
+ : "memory");
+}
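+
+// The alpha handling above relies on a byte-wrap trick: with a1 in {0, 1},
+// (a1 ^ 0xff) + 1 wraps to 0x00 for a1 == 0 and yields 0xff for a1 == 1.
+// Scalar sketch for one ARGB1555 pixel, illustrative only:
+static void ARGB1555ToARGBPixel_Sketch(uint16_t argb1555, uint8_t argb[4]) {
+  uint8_t b = argb1555 & 0x1f;
+  uint8_t g = (argb1555 >> 5) & 0x1f;
+  uint8_t r = (argb1555 >> 10) & 0x1f;
+  argb[0] = (uint8_t)((b << 3) | (b >> 2));     // B: 5 -> 8 bits
+  argb[1] = (uint8_t)((g << 3) | (g >> 2));     // G: 5 -> 8 bits
+  argb[2] = (uint8_t)((r << 3) | (r >> 2));     // R: 5 -> 8 bits
+  argb[3] = (argb1555 & 0x8000) ? 0xff : 0x00;  // A: 1 -> 8 bits
+}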
+
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[6];
+ uint64_t c0 = 0x000f000f000f000f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psrlh %[a], %[src1], %[four] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "psllh %[src0], %[a], %[four] \n\t"
+ "or %[a], %[src0], %[a] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[a] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
+ : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
+ [four] "f"(0x04)
+ : "memory");
+}
+
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+ uint64_t mask0 = 0xc6;
+ uint64_t mask1 = 0x18;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
+ "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
+ "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
+ "punpcklbh %[ftmp2], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+
+ "pextrh %[src0], %[ftmp1], %[two] \n\t"
+ "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t"
+ "pshufh %[ftmp1], %[ftmp1], %[one] \n\t"
+
+ "pextrh %[src0], %[ftmp2], %[two] \n\t"
+ "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t"
+ "pextrh %[src0], %[ftmp2], %[one] \n\t"
+ "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t"
+ "pextrh %[src0], %[ftmp2], %[zero] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "pinsrh_0 %[src1], %[src1], %[src0] \n\t"
+ "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "packushb %[src1], %[src1], %[zero] \n\t"
+
+ "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
+ [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
+ [one] "f"(0x01), [two] "f"(0x02)
+ : "memory");
+}
+
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[two] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[eleven] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
+ [eleven] "f"(0x0b)
+ : "memory");
+}
+
+// dither4 is a row of 4 values from a 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The first byte of the 4 values maps to the upper left of the 4x4 matrix.
+// The 4 values are passed as an int, then referenced as an array, so
+// endianness does not affect the order of the original matrix. But dither4
+// will contain the first pixel in the lower byte for little endian
+// or the upper byte for big endian. A scalar sketch of the dithered
+// conversion follows this function.
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+
+ __asm__ volatile(
+ "punpcklbh %[dither], %[dither], %[zero] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+
+ "paddh %[b], %[b], %[dither] \n\t"
+ "paddh %[g], %[g], %[dither] \n\t"
+ "paddh %[r], %[r], %[dither] \n\t"
+ "pcmpgth %[src0], %[b], %[c0] \n\t"
+ "or %[src0], %[src0], %[b] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "pcmpgth %[src0], %[g], %[c0] \n\t"
+ "or %[src0], %[src0], %[g] \n\t"
+ "and %[g], %[src0], %[c0] \n\t"
+ "pcmpgth %[src0], %[r], %[c0] \n\t"
+ "or %[src0], %[src0], %[r] \n\t"
+ "and %[r], %[src0], %[c0] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[two] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[eleven] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
+ [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
+ : "memory");
+}
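+
+// Scalar sketch of the dithered conversion above: add this pixel's dither
+// byte, clamp to 255 (the kernel does this branch-free with
+// pcmpgth/or/and), then truncate to 5/6/5 bits. Illustrative only.
+static uint16_t ARGBToRGB565DitherPixel_Sketch(const uint8_t argb[4],
+                                               uint8_t dither) {
+  int b = argb[0] + dither;
+  int g = argb[1] + dither;
+  int r = argb[2] + dither;
+  if (b > 255) b = 255;
+  if (g > 255) g = 255;
+  if (r > 255) r = 255;
+  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
+}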
+
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+ "punpckhbh %[a], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[three] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+ "psrlh %[a], %[a], %[seven] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[ten] \n\t"
+ "psllh %[a], %[a], %[fifteen] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+ "or %[b], %[b], %[a] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
+ [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
+ : "memory");
+}
+
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+ "punpckhbh %[a], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[four] \n\t"
+ "psrlh %[g], %[g], %[four] \n\t"
+ "psrlh %[r], %[r], %[four] \n\t"
+ "psrlh %[a], %[a], %[four] \n\t"
+
+ "psllh %[g], %[g], %[four] \n\t"
+ "psllh %[r], %[r], %[eight] \n\t"
+ "psllh %[a], %[a], %[twelve] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+ "or %[b], %[b], %[a] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
+ [twelve] "f"(0x0c)
+ : "memory");
+}
+
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001004200810019;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
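+
+// Scalar sketch of the luma computation above. The multiplier constant
+// 0x0001004200810019 packs the BT.601 coefficients (B=25, G=129, R=66), and
+// the 1 in the top lane applies value = 0x1080, i.e. the +16 luma offset
+// plus 0x80 rounding before the >> 8. Illustrative only.
+static uint8_t ARGBToYPixel_Sketch(const uint8_t argb[4]) {
+  return (uint8_t)((25 * argb[0] + 129 * argb[1] + 66 * argb[2] + 0x1080) >>
+                   8);
+}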
+
+void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
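+
+// Scalar sketch of the chroma computation above, written against the 2x2
+// average of each channel. The kernel carries doubled sums and halved
+// coefficients (mask_u lanes 56/37/19 with value 0x4040), which is
+// arithmetically the same as the usual constants below. Illustrative only.
+static void ARGBToUVPixel_Sketch(int avg_b, int avg_g, int avg_r,
+                                 uint8_t* u, uint8_t* v) {
+  *u = (uint8_t)((112 * avg_b - 74 * avg_g - 38 * avg_r + 0x8080) >> 8);
+  *v = (uint8_t)((112 * avg_r - 94 * avg_g - 18 * avg_b + 0x8080) >> 8);
+}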
+
+void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0019008100420001;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
+void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
+void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001001900810042;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
+void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002F00380002;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
+ "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
+ "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
+ "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
+ "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
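+// RGBAToYRow_MMI converts 8 RGBA pixels (A, B, G, R in memory) to 8 luma
+// bytes per iteration. The halfword coefficients in |mask| are 25, 129 and
+// 66 for B, G and R, and pinsrh_0 swaps the alpha halfword for |value|, so
+// each lane evaluates libyuv's fixed-point BT.601 luma formula:
+//   Y = (66 * R + 129 * G + 25 * B + 0x1080) >> 8
+// where 0x1080 folds the +16 luma offset together with the rounding term.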
+void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0042008100190001;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
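+// RGBAToUVRow_MMI emits one U and one V byte per 2x2 block of RGBA pixels,
+// 8 of each per iteration (16 pixels across two rows). Relative to the
+// variants above, the bias lands in halfword 0 (pinsrh_0, the alpha slot)
+// and mask_u/mask_v carry the halved chroma coefficients in B, G, R order
+// to match the A, B, G, R byte layout.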
+void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
+ "dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
+ "dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
+ "dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
+ "dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
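+// RGB24ToYRow_MMI applies the same BT.601 luma formula as RGBAToYRow_MMI to
+// the 3-bytes-per-pixel RGB24 (B, G, R in memory) layout. Each 8-byte load
+// covers two pixels: the dsll by 8 re-aligns the second pixel so punpckhbh
+// can widen it, and pinsrh_3 overwrites the leftover fourth halfword with
+// |value|; |mask| pairs that slot with coefficient 1 to add the bias.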
+void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001004200810019;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
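+// RGB24ToUVRow_MMI: the same 2x2 box-average-and-multiply structure as
+// RGBAToUVRow_MMI, adapted to 3 bytes per pixel. Source offsets advance by
+// 6 bytes (two pixels) per load and the dsll by 8 re-aligns the second
+// pixel, so one iteration consumes 0x30 bytes (16 pixels) per row and
+// produces 8 U and 8 V bytes.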
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
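+// RAWToYRow_MMI is RGB24ToYRow_MMI with the channel order flipped: RAW
+// stores R, G, B in memory, so |mask| carries the luma coefficients as
+// 66, 129, 25 in halfwords 0..2 instead of 25, 129, 66.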
+void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001001900810042;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
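+// RAWToUVRow_MMI: the R, G, B (RAW) counterpart of RGB24ToUVRow_MMI.
+// mask_u/mask_v and the psubw operand orders are mirrored to account for
+// the swapped positions of the R and B channels.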
+void RAWToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
+
+ __asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
+ "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
+ "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
+ "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
+ "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
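+// ARGBToYJRow_MMI produces full-range (JPEG) luma: the coefficients in
+// |mask1| are 29, 150 and 77 for B, G and R, and |value| = 0x80 is rounding
+// only, with no +16 offset, so each lane computes
+//   YJ = (77 * R + 150 * G + 29 * B + 0x80) >> 8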
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest0, dest1, dest2, dest3;
+ uint64_t tmp0, tmp1;
+ const uint64_t shift = 0x08;
+ const uint64_t value = 0x80;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x0001004D0096001DULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest2], %[dest2], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest3], %[dest3], %[shift] \n\t"
+
+ "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
+ "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
+ "packushb %[dest], %[tmp0], %[tmp1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
+ [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
+ [tmp1] "=&f"(tmp1)
+ : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
+ [width] "r"(width)
+ : "memory");
+}
+
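+// ARGBToUVJRow_MMI computes full-range (JPEG) chroma. The 2x2 box average
+// uses pavgh for the rounded horizontal fold instead of the explicit
+// add-one-and-shift idiom, and mask_u/mask_v hold the halved JPEG-range
+// coefficients; the geometry is unchanged (16 ARGB pixels in, 8 U and 8 V
+// bytes out per iteration).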
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0015002a003f0002;
+ const uint64_t mask_v = 0x0002003f0035000a;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
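+
+// Editor's note (an annotation, not part of the upstream roll): the loop
+// above shows the unpack/multiply/pack pattern shared by the *ToUVRow_MMI
+// kernels in this file. Each 8-byte group is widened to 16-bit lanes
+// (punpcklbh/punpckhbh against zero), the two source rows are summed and
+// averaged (paddh + pavgh), and the U and V dot products are formed with
+// pmaddhw against mask_u/mask_v. Because pmaddhw can only add lane pairs,
+// the subtractive coefficients are applied afterwards: punpcklwd/punpckhwd
+// separate the positive and negative partial sums, psubw takes their
+// difference, and psraw >> 8 rescales before packsswh/packushb narrow the
+// results back to bytes.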
+
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ uint64_t ftmp[11];
+ const uint64_t value = 0x1080108010801080;
+ const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
+ [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
+ : "memory");
+}
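+
+// Editor's sketch (added annotation, not upstream code): a scalar model of
+// what RGB565ToYRow_MMI computes per pixel. The 25/129/66 weights and the
+// 0x1080 rounding bias are read off the `mask` and `value` constants above;
+// the MMI loop itself handles 8 pixels per iteration, so it assumes width is
+// a multiple of 8.
+static void RGB565ToYRow_ScalarSketch(const uint8_t* src_rgb565,
+                                      uint8_t* dst_y,
+                                      int width) {
+  for (int x = 0; x < width; ++x) {
+    uint16_t px = (uint16_t)(src_rgb565[0] | (src_rgb565[1] << 8));
+    uint16_t b5 = px & 0x1f, g6 = (px >> 5) & 0x3f, r5 = px >> 11;
+    // Replicate the top bits to expand 5/6-bit channels to 8 bits, exactly
+    // as the psllh/psrlh/or triples do above.
+    uint8_t b = (uint8_t)((b5 << 3) | (b5 >> 2));
+    uint8_t g = (uint8_t)((g6 << 2) | (g6 >> 4));
+    uint8_t r = (uint8_t)((r5 << 3) | (r5 >> 2));
+    dst_y[x] = (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
+    src_rgb565 += 2;
+  }
+}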
+
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t ftmp[11];
+ const uint64_t value = 0x1080108010801080;
+ const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0003000300030003;
+  const uint64_t c3 = 0x007c007c007c007c;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
+ [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
+ [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
+        [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02)
+ : "memory");
+}
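+
+// Editor's note: ARGB1555ToYRow_MMI follows the same scheme as the RGB565
+// sketch above. Only the channel extraction differs: all three channels are
+// 5 bits wide (red is isolated with the 0x7c mask, green pieced together
+// from both bytes), so each is expanded with (c << 3) | (c >> 2), and the
+// alpha bit is ignored.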
+
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t ftmp[11];
+  const uint64_t value = 0x1080108010801080;
+  const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x000f000f000f000f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
+ [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
+ [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
+ : "memory");
+}
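+
+// Editor's note: for ARGB4444 each channel is a 4-bit nibble, so the
+// expansion to 8 bits reduces to (c << 4) | c, which is what the three
+// psllh/or pairs above implement; the Y dot product then reuses the same
+// 25/129/66 weights and 0x1080 bias.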
+
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[13];
+  const uint64_t value = 0x2020202020202020;
+  const uint64_t mask_u = 0x0026004a00700002;
+  const uint64_t mask_v = 0x00020070005e0012;
+  const uint64_t mask = 0x93;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest0_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest0_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest1_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest1_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest2_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest2_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest2_v], %[src0], %[c2] \n\t"
+ "psllh %[dest2_v], %[dest2_v], %[three] \n\t"
+ "or %[dest2_v], %[src1], %[dest2_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest2_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest3_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest3_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest3_v], %[src0], %[c2] \n\t"
+ "psllh %[dest3_v], %[dest3_v], %[three] \n\t"
+ "or %[dest3_v], %[src1], %[dest3_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest3_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t"
+ "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
+ [dest3_v] "=&f"(ftmp[12])
+ : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
+ [one] "f"(0x01)
+ : "memory");
+}
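+
+// Editor's note (derived from the mask_u/mask_v/value constants above, not
+// upstream documentation): per 2x2 block of RGB565 pixels, the loop sums the
+// channels over two rows and two columns, rescales the sums to 8-bit
+// averages, and then applies the usual libyuv chroma weights:
+//   U = (112*b - 74*g - 38*r + 0x8080) >> 8
+//   V = (112*r - 94*g - 18*b + 0x8080) >> 8
+// The loop consumes 16 pixels per iteration and reads a second row at
+// src_rgb565 + src_stride_rgb565, so width is assumed to be a multiple of 16.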
+
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[11];
+  const uint64_t value = 0x2020202020202020;
+  const uint64_t mask_u = 0x0026004a00700002;
+  const uint64_t mask_v = 0x00020070005e0012;
+  const uint64_t mask = 0x93;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0003000300030003;
+  const uint64_t c3 = 0x007c007c007c007c;
+ __asm__ volatile(
+ "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest0_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest0_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest1_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest1_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest2_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest2_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest3_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest3_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[dest0_u], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t"
+ "packushb %[dest0_v], %[dest1_u], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t"
+ "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10])
+ : [src_argb1555] "r"(src_argb1555),
+ [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
+ [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
+ [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
+ [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
+ [two] "f"(0x02), [one] "f"(0x01)
+ : "memory");
+}
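+
+// Editor's note: this kernel computes the same U/V dot products as the
+// RGB565 variant above but with only 11 floating-point temporaries, so the
+// results of the first two 8-pixel groups are pre-packed with packsswh
+// halfway through the iteration to free dest0_v/dest1_v for reuse by the
+// second half.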
+
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[13];
+  const uint64_t value = 0x2020202020202020;
+  const uint64_t mask_u = 0x0026004a00700002;
+  const uint64_t mask_v = 0x00020070005e0012;
+  const uint64_t mask = 0x93;
+  const uint64_t c0 = 0x000f000f000f000f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest0_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest0_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest1_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest1_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest2_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest2_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest2_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest3_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest3_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest3_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t"
+ "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
+ [dest3_v] "=&f"(ftmp[12])
+ : [src_argb4444] "r"(src_argb4444),
+ [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
+ [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
+ [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
+ [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
+ [two] "f"(0x02)
+ : "memory");
+}
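+
+// Editor's note: the ARGB4444 chroma path mirrors the RGB565 one; the 4-bit
+// channel sums over each 2x2 block are rescaled to 8 bits with
+// (sum << 2) | (sum >> 4) before the same mask_u/mask_v dot products are
+// applied.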
+
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0026004a00700002;
+ const uint64_t mask_v = 0x00020070005e0012;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
+ [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
+ [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
+ [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
+ [dest3_v] "=&f"(ftmp[11])
+ : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
+ [eight] "f"(0x08)
+ : "memory");
+}
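+
+// Editor's sketch (added annotation, not upstream code): ARGBToUV444 keeps
+// full chroma resolution, so a scalar model is just the per-pixel dot
+// products encoded in mask_u/mask_v, with the 0x4040 lane doubling to the
+// usual 0x8080 bias. The MMI loop handles 8 pixels per iteration, so width
+// is assumed to be a multiple of 8.
+static void ARGBToUV444Row_ScalarSketch(const uint8_t* src_argb,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
+  for (int x = 0; x < width; ++x) {
+    int b = src_argb[0], g = src_argb[1], r = src_argb[2];  // alpha unused
+    dst_u[x] = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
+    dst_v[x] = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
+    src_argb += 4;
+  }
+}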
+
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
+ uint64_t tmp0, tmp1;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x01;
+ const uint64_t mask2 = 0x0080004D0096001DULL;
+ const uint64_t mask3 = 0xFF000000FF000000ULL;
+ const uint64_t mask4 = ~mask3;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "and %[src37], %[src], %[mask3] \n\t"
+
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
+ "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"
+
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
+ "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
+ "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask4] \n\t"
+ "or %[dest], %[dest], %[src37] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
+ [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
+ [src37] "=&f"(src37)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
+ : "memory");
+}
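+
+// Editor's sketch (added annotation, not upstream code): per pixel, the gray
+// kernel evaluates the full-range luma dot product packed into mask2
+// (29/150/77 with +128 rounding), stores it to B, G and R, and carries the
+// original alpha byte through via the mask3/mask4 select. Two pixels are
+// handled per iteration, so width must be even.
+static void ARGBGrayRow_ScalarSketch(const uint8_t* src_argb,
+                                     uint8_t* dst_argb,
+                                     int width) {
+  for (int x = 0; x < width; ++x) {
+    int b = src_argb[0], g = src_argb[1], r = src_argb[2];
+    uint8_t y = (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
+    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;
+    dst_argb[3] = src_argb[3];  // alpha passes through unchanged
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}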
+
+// Convert a row of an image to sepia tone.
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
+ uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
+ uint64_t tmp0, tmp1;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x002300440011ULL;
+ const uint64_t mask2 = 0x002D00580016ULL;
+ const uint64_t mask3 = 0x003200620018ULL;
+ const uint64_t mask4 = 0xFF000000FF000000ULL;
+ const uint64_t shift = 0x07;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "and %[dest37], %[dest], %[mask4] \n\t"
+
+ "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
+ "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
+ "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
+ "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
+ "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
+ "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
+ "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+
+ "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
+ "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
+ "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
+ "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
+ "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
+ "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
+ "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "or %[dest], %[dest], %[dest37] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
+ [dest] "=&f"(dest)
+ : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
+ [mask4] "f"(mask4), [shift] "f"(shift)
+ : "memory");
+}
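+
+// Reference-only scalar sketch (hypothetical helper): each channel becomes a
+// >>7 weighted sum of B/G/R using the weights packed in mask1/mask2/mask3,
+// clamped to 255 by packushb; the alpha byte is preserved via mask4.
+void ARGBSepiaRow_SketchC(uint8_t* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
+    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
+    int sb = (17 * b + 68 * g + 35 * r) >> 7;
+    int sg = (22 * b + 88 * g + 45 * r) >> 7;
+    int sr = (24 * b + 98 * g + 50 * r) >> 7;
+    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
+    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
+    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
+    dst_argb += 4;  // Alpha byte untouched.
+  }
+}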
+
+// Apply color matrix to a row of image. Matrix is signed.
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
+ dest3;
+ uint64_t matrix, matrix_hi, matrix_lo;
+ uint64_t tmp0, tmp1;
+ const uint64_t shift0 = 0x06;
+ const uint64_t shift1 = 0x08;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+
+ "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest0], %[dest0], %[shift0] \n\t"
+
+ "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest1], %[dest1], %[shift0] \n\t"
+
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+
+ "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest2], %[dest2], %[shift0] \n\t"
+
+ "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest3], %[dest3], %[shift0] \n\t"
+
+ "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
+ "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
+ "packushb %[dest], %[tmp0], %[tmp1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
+ [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
+ [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
+ : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
+ [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
+ : "memory");
+}
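+
+// Reference-only scalar sketch (hypothetical helper): every output channel
+// is a signed dot product of the BGRA pixel with one 4-entry row of
+// matrix_argb, shifted right by 6 (shift0) and clamped by packushb.
+void ARGBColorMatrixRow_SketchC(const uint8_t* src_argb, uint8_t* dst_argb,
+                                const int8_t* matrix_argb, int width) {
+  for (int x = 0; x < width; ++x) {
+    for (int j = 0; j < 4; ++j) {
+      const int8_t* m = matrix_argb + j * 4;
+      int v = (src_argb[0] * m[0] + src_argb[1] * m[1] + src_argb[2] * m[2] +
+               src_argb[3] * m[3]) >> 6;
+      dst_argb[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}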
+
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[src] \n\t"
+ "punpckhbh %[src_hi], %[src], %[src] \n\t"
+
+ "punpcklbh %[value], %[value], %[value] \n\t"
+
+ "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
+ [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [value] "f"(value), [shift] "f"(shift)
+ : "memory");
+}
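+
+// Reference-only scalar sketch of the intended per-channel math, as computed
+// on the first pass through the loop above (hypothetical helper): the byte
+// self-interleaves widen s and v to s*0x101 and v*0x101, and pmulhuh plus
+// psrlh keep bits [31:24] of the product, roughly (s * v) >> 8.
+static inline uint8_t ShadeChannel_SketchC(uint8_t s, uint8_t v) {
+  return (uint8_t)(((uint32_t)(s * 0x101) * (uint32_t)(v * 0x101)) >> 24);
+}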
+
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
+ uint64_t dest, dest_lo, dest_hi;
+ const uint64_t mask = 0x0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
+ "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"
+
+ "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
+ "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
+ [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
+ : "memory");
+}
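+
+// Reference-only scalar sketch (hypothetical helper): src0 is widened to
+// s0*0x101 by the self-interleave while src1 stays a plain byte, so pmulhuh
+// returns (s0 * 0x101 * s1) >> 16, approximately (s0 * s1) >> 8.
+static inline uint8_t MultiplyChannel_SketchC(uint8_t s0, uint8_t s1) {
+  return (uint8_t)(((uint32_t)(s0 * 0x101) * s1) >> 16);
+}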
+
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "paddusb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "psubusb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width)
+ : "memory");
+}
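+
+// Both rows above are plain per-byte saturating arithmetic; a reference-only
+// scalar sketch (hypothetical helpers):
+static inline uint8_t AddClamp255_SketchC(uint8_t a, uint8_t b) {
+  int v = a + b;
+  return (uint8_t)(v > 255 ? 255 : v);  // paddusb
+}
+static inline uint8_t SubClamp0_SketchC(uint8_t a, uint8_t b) {
+  int v = a - b;
+  return (uint8_t)(v < 0 ? 0 : v);  // psubusb
+}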
+
+// Sobel functions which mimic SSSE3.
+void SobelXRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ uint64_t y00 = 0, y10 = 0, y20 = 0;
+ uint64_t y02 = 0, y12 = 0, y22 = 0;
+ uint64_t zero = 0x0;
+ uint64_t sobel = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
+ "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2]
+ "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i]
+ "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2]
+ "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
+
+ "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i]
+ "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
+ "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2]
+ "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y20], %[y20], %[zero] \n\t"
+
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+ "punpcklbh %[y22], %[y22], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y10] \n\t" // a+b
+ "paddh %[y20], %[y20], %[y10] \n\t" // c+b
+ "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c
+
+ "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub
+ "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub
+ "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub
+
+ "pmaxsh %[y10], %[y00], %[y02] \n\t"
+ "pminsh %[y20], %[y00], %[y02] \n\t"
+ "psubh %[sobel], %[y10], %[y20] \n\t" // Abs
+
+ "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
+ "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
+ "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
+ "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
+ "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
+
+ "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
+ "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
+ "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
+ "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y20], %[y20], %[zero] \n\t"
+
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+ "punpcklbh %[y22], %[y22], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y10] \n\t"
+ "paddh %[y20], %[y20], %[y10] \n\t"
+ "paddh %[y00], %[y00], %[y20] \n\t"
+
+ "paddh %[y02], %[y02], %[y12] \n\t"
+ "paddh %[y22], %[y22], %[y12] \n\t"
+ "paddh %[y02], %[y02], %[y22] \n\t"
+
+ "pmaxsh %[y10], %[y00], %[y02] \n\t"
+ "pminsh %[y20], %[y00], %[y02] \n\t"
+ "psubh %[y00], %[y10], %[y20] \n\t"
+
+ "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
+ "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
+
+ "daddiu %[src_y0], %[src_y0], 8 \n\t"
+ "daddiu %[src_y1], %[src_y1], 8 \n\t"
+ "daddiu %[src_y2], %[src_y2], 8 \n\t"
+ "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
+ [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
+ : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
+ [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
+ : "memory");
+}
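+
+// Reference-only scalar sketch of the filter above (hypothetical helper): a
+// 1-2-1 vertical smoothing of columns i and i+2, then the
+// pmaxsh/pminsh/psubh absolute difference and the packushb clamp to 255.
+void SobelXRow_SketchC(const uint8_t* src_y0, const uint8_t* src_y1,
+                       const uint8_t* src_y2, uint8_t* dst_sobelx,
+                       int width) {
+  for (int i = 0; i < width; ++i) {
+    int a = src_y0[i] + 2 * src_y1[i] + src_y2[i];
+    int b = src_y0[i + 2] + 2 * src_y1[i + 2] + src_y2[i + 2];
+    int sobel = a > b ? a - b : b - a;
+    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
+  }
+}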
+
+void SobelYRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ uint64_t y00 = 0, y01 = 0, y02 = 0;
+ uint64_t y10 = 0, y11 = 0, y12 = 0;
+ uint64_t zero = 0x0;
+ uint64_t sobel = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
+ "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
+ "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1]
+ "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2]
+ "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i]
+ "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
+ "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1]
+ "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2]
+ "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y01], %[y01], %[zero] \n\t"
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y11], %[y11], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y01] \n\t" // a+b
+ "paddh %[y02], %[y02], %[y01] \n\t" // c+b
+ "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c
+
+ "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub
+ "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub
+ "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub
+
+ "pmaxsh %[y02], %[y00], %[y10] \n\t"
+ "pminsh %[y12], %[y00], %[y10] \n\t"
+ "psubh %[sobel], %[y02], %[y12] \n\t" // Abs
+
+ "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
+ "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
+ "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
+ "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
+ "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
+ "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
+ "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
+ "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
+ "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y01], %[y01], %[zero] \n\t"
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y11], %[y11], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y01] \n\t"
+ "paddh %[y02], %[y02], %[y01] \n\t"
+ "paddh %[y00], %[y00], %[y02] \n\t"
+
+ "paddh %[y10], %[y10], %[y11] \n\t"
+ "paddh %[y12], %[y12], %[y11] \n\t"
+ "paddh %[y10], %[y10], %[y12] \n\t"
+
+ "pmaxsh %[y02], %[y00], %[y10] \n\t"
+ "pminsh %[y12], %[y00], %[y10] \n\t"
+ "psubh %[y00], %[y02], %[y12] \n\t"
+
+ "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
+ "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
+
+ "daddiu %[src_y0], %[src_y0], 8 \n\t"
+ "daddiu %[src_y1], %[src_y1], 8 \n\t"
+ "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
+ [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
+ : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
+ [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
+ : "memory");
+}
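+
+// Reference-only scalar sketch (hypothetical helper): the same 1-2-1
+// smoothing and absolute difference as SobelXRow, but taken along each of
+// two rows instead of down each of two columns.
+void SobelYRow_SketchC(const uint8_t* src_y0, const uint8_t* src_y1,
+                       uint8_t* dst_sobely, int width) {
+  for (int i = 0; i < width; ++i) {
+    int a = src_y0[i] + 2 * src_y0[i + 1] + src_y0[i + 2];
+    int b = src_y1[i] + 2 * src_y1[i + 1] + src_y1[i + 2];
+    int sobel = a > b ? a - b : b - a;
+    dst_sobely[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
+  }
+}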
+
+void SobelRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+  uint64_t temp[3];
+ uint64_t c1 = 0xff000000ff000000;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i]
+ "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
+ // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
+ "paddusb %[t2] , %[t0], %[t1] \n\t"
+
+ // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
+ "punpcklbh %[t0], %[t2], %[t2] \n\t"
+
+      // s1 s1 s0 s0->s1 s1 s1 s1 s0 s0 s0 s0
+ "punpcklbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+      // 255 s1 s1 s1 255 s0 s0 s0
+ "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
+
+ // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
+ "punpckhbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ // 255 s3 s3 s3 255 s2 s2 s2
+ "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
+
+ // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
+ "punpckhbh %[t0], %[t2], %[t2] \n\t"
+
+ // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
+ "punpcklbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
+
+ // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
+ "punpckhbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
+
+ "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
+ : "memory");
+}
+
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t tr = 0;
+ uint64_t tb = 0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
+ "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
+ "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "paddusb %[tr], %[tr], %[tb] \n\t" // g
+ "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
+
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [tr] "=&f"(tr), [tb] "=&f"(tb)
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_y] "r"(dst_y), [width] "r"(width)
+ : "memory");
+}
+
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t temp[3];
+ uint64_t result = 0;
+ uint64_t gb = 0;
+ uint64_t cr = 0;
+ uint64_t c1 = 0xffffffffffffffff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
+ "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t"
+ "paddusb %[tg] , %[tr], %[tb] \n\t" // g
+
+ // g3 b3 g2 b2 g1 b1 g0 b0
+ "punpcklbh %[gb], %[tb], %[tg] \n\t"
+      // c3 r3 c2 r2 c1 r1 c0 r0
+ "punpcklbh %[cr], %[tr], %[c1] \n\t"
+ // c1 r1 g1 b1 c0 r0 g0 b0
+ "punpcklhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
+ // c3 r3 g3 b3 c2 r2 g2 b2
+ "punpckhhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
+
+ // g7 b7 g6 b6 g5 b5 g4 b4
+ "punpckhbh %[gb], %[tb], %[tg] \n\t"
+ // c7 r7 c6 r6 c5 r5 c4 r4
+ "punpckhbh %[cr], %[tr], %[c1] \n\t"
+ // c5 r5 g5 b5 c4 r4 g4 b4
+ "punpcklhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
+ // c7 r7 g7 b7 c6 r6 g6 b6
+ "punpckhhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
+
+ "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
+ [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
+ : "memory");
+}
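+
+// Reference-only scalar sketch of the packing above (hypothetical helper):
+// per pixel the interleaves place sobely in B, the saturated sum in G,
+// sobelx in R, and 255 (from c1) in A.
+void SobelXYRow_SketchC(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                        uint8_t* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    int s = src_sobelx[i] + src_sobely[i];
+    dst_argb[0] = src_sobely[i];                 // B
+    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  // G (paddusb)
+    dst_argb[2] = src_sobelx[i];                 // R
+    dst_argb[3] = 255;                           // A
+    dst_argb += 4;
+  }
+}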
+
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ // Copy a Y to RGB.
+ uint64_t src, dest;
+ const uint64_t mask0 = 0x00ffffff00ffffffULL;
+ const uint64_t mask1 = ~mask0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src], %[src], %[src] \n\t"
+ "punpcklhw %[dest], %[src], %[src] \n\t"
+ "and %[dest], %[dest], %[mask0] \n\t"
+ "or %[dest], %[dest], %[mask1] \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+
+ "punpckhhw %[dest], %[src], %[src] \n\t"
+ "and %[dest], %[dest], %[mask0] \n\t"
+ "or %[dest], %[dest], %[mask1] \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+// TODO: respect YuvConstants.
+void I400ToARGBRow_MMI(const uint8_t* src_y,
+                       uint8_t* rgb_buf,
+                       const struct YuvConstants* /* yuvconstants */,
+                       int width) {
+ uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x55;
+ const uint64_t mask2 = 0xAA;
+ const uint64_t mask3 = 0xFF;
+ const uint64_t mask4 = 0x4A354A354A354A35ULL;
+ const uint64_t mask5 = 0x0488048804880488ULL;
+ const uint64_t shift0 = 0x08;
+ const uint64_t shift1 = 0x06;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+
+ "pshufh %[src], %[src_lo], %[mask0] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_lo], %[mask1] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_lo], %[mask2] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_lo], %[mask3] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_hi], %[mask0] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_hi], %[mask1] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_hi], %[mask2] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_hi], %[mask3] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo)
+ : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
+ [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
+ [shift1] "f"(shift1), [width] "r"(width)
+ : "memory");
+}
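+
+// Worked sketch of the per-byte math above (hypothetical helper): psllh plus
+// paddush widen y to y*0x101; mask4 (0x4A35 = 19013) and mask5 (0x0488 =
+// 1160) then give roughly (y - 16) * 1.164, the BT.601 limited-to-full range
+// expansion, clamped by packushb; pinsrh_3 forces the alpha lane to 255.
+static inline uint8_t I400Expand_SketchC(uint8_t y) {
+  int v = ((((int)y * 0x101) * 19013) >> 16) - 1160;   // pmulhuh, psubh
+  v >>= 6;                                             // psrah shift1
+  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   // packushb
+}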
+
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, src0, src1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x1b;
+
+ src += width - 1;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[source], 0(%[src_ptr]) \n\t"
+ "gsldrc1 %[source], -7(%[src_ptr]) \n\t"
+ "punpcklbh %[src0], %[source], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask1] \n\t"
+ "punpckhbh %[src1], %[source], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "packushb %[dest], %[src1], %[src0] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], -0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
+ [src1] "=&f"(src1)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src0, src1, dest0, dest1;
+ const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
+ const uint64_t mask1 = 0x1b;
+ const uint64_t shift = 0x08;
+
+ src_uv += (width - 1) << 1;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 1(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], -6(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], -7(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], -14(%[src_ptr]) \n\t"
+
+ "and %[dest0], %[src0], %[mask0] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "and %[dest1], %[src1], %[mask0] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t"
+
+ "psrlh %[dest0], %[src0], %[shift] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "psrlh %[dest1], %[src1], %[shift] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], -0x10 \n\t"
+ "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t"
+ "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
+ [src1] "=&f"(src1)
+ : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
+ [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ src += (width - 1) * 4;
+ uint64_t temp = 0x0;
+ uint64_t shuff = 0x4e; // 01 00 11 10
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[temp], 3(%[src]) \n\t"
+ "gsldrc1 %[temp], -4(%[src]) \n\t"
+ "pshufh %[temp], %[temp], %[shuff] \n\t"
+ "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
+ "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
+
+ "daddiu %[src], %[src], -0x08 \n\t"
+ "daddiu %[dst], %[dst], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [temp] "=&f"(temp)
+ : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
+ : "memory");
+}
+
+void SplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
+
+ "and %[t2], %[t0], %[c0] \n\t"
+ "and %[t3], %[t1], %[c0] \n\t"
+ "packushb %[t2], %[t2], %[t3] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
+
+ "psrlh %[t2], %[t0], %[shift] \n\t"
+ "psrlh %[t3], %[t1], %[shift] \n\t"
+ "packushb %[t2], %[t2], %[t3] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
+
+ "daddiu %[src_uv], %[src_uv], 16 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [t3] "=&f"(temp[3])
+ : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+void MergeUVRow_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ uint64_t temp[3];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
+ "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
+ "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
+ "punpcklbh %[t2], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
+ "punpckhbh %[t2], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
+
+ "daddiu %[src_u], %[src_u], 8 \n\t"
+ "daddiu %[src_v], %[src_v], 8 \n\t"
+ "daddiu %[dst_uv], %[dst_uv], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
+ : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [width] "r"(width)
+ : "memory");
+}
+
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ uint64_t src[4];
+ uint64_t dest_hi, dest_lo, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[src1] \n\t"
+ "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src2], %[src3] \n\t"
+
+ "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t"
+ "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t"
+ "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t"
+ "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
+ [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
+ [dstb_ptr] "r"(dst_b), [width] "r"(width)
+ : "memory");
+}
+
+void MergeRGBRow_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t srcr, srcg, srcb, dest;
+ uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
+ const uint64_t temp = 0x0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t"
+ "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t"
+ "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t"
+ "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t"
+ "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t"
+ "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t"
+
+ "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t"
+ "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t"
+ "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t"
+ "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t"
+
+ "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
+ "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t"
+ "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
+ "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
+ "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t"
+
+ "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t"
+ "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t"
+ "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
+ [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
+ [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
+ [srcbz_lo] "=&f"(srcbz_lo)
+ : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
+ [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
+ : "memory");
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0xff00ff00ff00ff00;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t temp[3];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ uint64_t src_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c1] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c1] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
+ [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
+ : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
+ : "memory");
+}
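+
+// Reference-only scalar sketch (hypothetical helper): YUY2 stores Y0 U Y1 V,
+// so U and V sit at bytes 4*i+1 and 4*i+3, and pavgb is the rounding average
+// (a + b + 1) >> 1 of the two rows.
+void YUY2ToUVRow_SketchC(const uint8_t* src_yuy2, int src_stride_yuy2,
+                         uint8_t* dst_u, uint8_t* dst_v, int width) {
+  const uint8_t* next_row = src_yuy2 + src_stride_yuy2;
+  for (int x = 0; x < width; x += 2) {
+    dst_u[0] = (uint8_t)((src_yuy2[1] + next_row[1] + 1) >> 1);
+    dst_v[0] = (uint8_t)((src_yuy2[3] + next_row[3] + 1) >> 1);
+    src_yuy2 += 4;
+    next_row += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}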
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0xff00ff00ff00ff00;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c1] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c1] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
+ [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
+ : "memory");
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
+ : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
+ [c0] "f"(c0)
+ : "memory");
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ // Output a row of UV values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[3];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ uint64_t src_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c0] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c0] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
+ [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
+ : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
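+
+// Reference-only scalar sketch (hypothetical helper): UYVY stores U Y0 V Y1,
+// so U and V sit at bytes 4*i and 4*i+2; otherwise identical to the YUY2
+// version above.
+void UYVYToUVRow_SketchC(const uint8_t* src_uyvy, int src_stride_uyvy,
+                         uint8_t* dst_u, uint8_t* dst_v, int width) {
+  const uint8_t* next_row = src_uyvy + src_stride_uyvy;
+  for (int x = 0; x < width; x += 2) {
+    dst_u[0] = (uint8_t)((src_uyvy[0] + next_row[0] + 1) >> 1);
+    dst_v[0] = (uint8_t)((src_uyvy[2] + next_row[2] + 1) >> 1);
+    src_uyvy += 4;
+    next_row += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}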
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ // Output a row of UV values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c0] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c0] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
+ [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ // Output a row of Y values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t shift = 0x08;
+ uint64_t temp[2];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
+ "dsrl %[t0], %[t0], %[shift] \n\t"
+ "dsrl %[t1], %[t1], %[shift] \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+      "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
+ : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
+ [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
+ dest_lo;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
+ const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
+ const uint64_t mask3 = 0xFF;
+ const uint64_t mask4 = ~mask1;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
+
+ "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
+ "pshufh %[alpha], %[alpha], %[mask3] \n\t"
+ "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"
+
+ "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
+
+ "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
+ "pshufh %[alpha], %[alpha], %[mask3] \n\t"
+ "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[mask4] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
+ [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
+ [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
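+
+// Reference-only scalar sketch (hypothetical helper): pshufh broadcasts
+// (255 - src0 alpha) computed from mask2, src1 is scaled by it, and
+// mask1/mask4 force the result alpha to 255 after the saturating add.
+void ARGBBlendRow_SketchC(const uint8_t* src_argb0, const uint8_t* src_argb1,
+                          uint8_t* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
+    int a = src_argb0[3];
+    for (int c = 0; c < 3; ++c) {
+      int v = src_argb0[c] + ((src_argb1[c] * (255 - a)) >> 8);
+      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
+    }
+    dst_argb[3] = 255;
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}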
+
+void BlendPlaneRow_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ uint64_t source0, source1, dest, alph;
+ uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
+ dest_lo;
+ uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
+ const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
+ "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
+
+ "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
+ "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
+ "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
+ "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
+ "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
+ "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
+ "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"
+
+ "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
+ "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+
+ "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
+ "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
+ [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
+ [alpha_r] "=&f"(alpha_rev)
+ : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
+ [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
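+
+// Reference-only scalar sketch (hypothetical helper): a per-byte linear
+// blend with the +255 rounding term (mask2) folded in before the >>8.
+static inline uint8_t BlendPlanePixel_SketchC(uint8_t s0, uint8_t s1,
+                                              uint8_t a) {
+  return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
+}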
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
+ const uint64_t mask0 = 0xFF;
+ const uint64_t mask1 = 0xFF000000FF000000ULL;
+ const uint64_t mask2 = ~mask1;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[src] \n\t"
+ "punpckhbh %[src_hi], %[src], %[src] \n\t"
+
+ "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
+ "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
+ "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask2] \n\t"
+ "and %[src], %[src], %[mask1] \n\t"
+ "or %[dest], %[dest], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
+ [width] "r"(width)
+ : "memory");
+}
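+
+// Reference-only scalar sketch of one channel (hypothetical helper): the
+// self-interleave and the pshufh alpha broadcast give s*0x101 and a*0x101,
+// and pmulhuh plus psrlh keep bits [31:24], roughly (s * a) >> 8; the alpha
+// byte itself passes through via mask1/mask2.
+static inline uint8_t AttenuateChannel_SketchC(uint8_t s, uint8_t a) {
+  return (uint8_t)(((uint32_t)(s * 0x101) * (uint32_t)(a * 0x101)) >> 24);
+}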
+
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width) {
+ int64_t row_sum[2] = {0, 0};
+ uint64_t src, dest0, dest1, presrc0, presrc1, dest;
+ const uint64_t mask = 0x0;
+
+ __asm__ volatile(
+ "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
+ "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"
+
+ "punpcklbh %[src], %[src], %[mask] \n\t"
+ "punpcklhw %[dest0], %[src], %[mask] \n\t"
+ "punpckhhw %[dest1], %[src], %[mask] \n\t"
+
+ "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
+ "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"
+
+ "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
+ "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
+ "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
+ "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"
+
+ "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
+ "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"
+
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
+ "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x01 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
+ [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
+ [presrc1] "=&f"(presrc1)
+ : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
+ [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
+ : "memory");
+}
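+
+// What the loop above computes, one ARGB pixel per iteration (scalar sketch,
+// illustrative only):
+//   for (c = 0; c < 4; ++c) {
+//     row_sum[c] += row[4 * x + c];
+//     cumsum[4 * x + c] = row_sum[c] + previous_cumsum[4 * x + c];
+//   }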
+
+// Bilinear row blend, 2x2 -> 2x1 (mirrors the C version).
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ if (source_y_fraction == 0) {
+ __asm__ volatile(
+ "1: \n\t"
+ "ld $t0, 0x0(%[src_ptr]) \n\t"
+ "sd $t0, 0x0(%[dst_ptr]) \n\t"
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ :
+ : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
+ : "memory");
+ return;
+ }
+ if (source_y_fraction == 128) {
+ uint64_t uv = 0x0;
+ uint64_t uv_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
+ "daddu $t0, %[src_ptr], %[stride] \n\t"
+ "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
+ "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
+
+ "pavgb %[uv], %[uv], %[uv_stride] \n\t"
+ "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
+ [stride] "r"((int64_t)src_stride)
+ : "memory");
+ return;
+ }
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint64_t temp;
+ uint64_t data[4];
+ uint64_t zero = 0x0;
+ uint64_t c0 = 0x0080008000800080;
+ uint64_t fy0 = 0x0100010001000100;
+ uint64_t shift = 0x8;
+ __asm__ volatile(
+ "pshufh %[fy1], %[fy1], %[zero] \n\t"
+ "psubh %[fy0], %[fy0], %[fy1] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
+ "punpcklbh %[d0], %[t0], %[zero] \n\t"
+ "punpckhbh %[d1], %[t0], %[zero] \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
+ "punpcklbh %[d2], %[t0], %[zero] \n\t"
+ "punpckhbh %[d3], %[t0], %[zero] \n\t"
+
+ "pmullh %[d0], %[d0], %[fy0] \n\t"
+ "pmullh %[d2], %[d2], %[fy1] \n\t"
+ "paddh %[d0], %[d0], %[d2] \n\t"
+ "paddh %[d0], %[d0], %[c0] \n\t"
+ "psrlh %[d0], %[d0], %[shift] \n\t"
+
+ "pmullh %[d1], %[d1], %[fy0] \n\t"
+ "pmullh %[d3], %[d3], %[fy1] \n\t"
+ "paddh %[d1], %[d1], %[d3] \n\t"
+ "paddh %[d1], %[d1], %[c0] \n\t"
+ "psrlh %[d1], %[d1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d1] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
+ [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
+ [dst_ptr] "r"(dst_ptr), [width] "r"(width),
+ [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
+ [shift] "f"(shift), [zero] "f"(zero)
+ : "memory");
+}
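+
+// General-case blend performed above, per byte (sketch derived from the
+// constants c0 = 0x0080... and fy0 = 0x0100...; illustrative only):
+//   f = source_y_fraction;  // 1..255, excluding the 0 and 128 fast paths
+//   dst[i] = (src[i] * (256 - f) + src[i + src_stride] * f + 128) >> 8;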
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ uint64_t source, dest0, dest1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
+ ((shuffler[2] & 0x03) << 4) |
+ ((shuffler[3] & 0x03) << 6);
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest0], %[src], %[mask0] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "punpckhbh %[dest1], %[src], %[mask0] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
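+
+// mask1 packs shuffler[0..3] into a 2-bit-per-lane pshufh selector, so the
+// loop is equivalent to (scalar sketch): for each pixel i and j = 0..3,
+//   dst_argb[4 * i + j] = src_argb[4 * i + shuffler[j]];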
+
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ uint64_t temp[3];
+ uint64_t vu = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
+ "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
+ "punpcklbh %[tu], %[ty], %[vu] \n\t" // g
+ "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
+ "punpckhbh %[tu], %[ty], %[vu] \n\t" // g
+ "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
+ "daddiu %[src_y], %[src_y], 8 \n\t"
+ "daddiu %[src_u], %[src_u], 4 \n\t"
+ "daddiu %[src_v], %[src_v], 4 \n\t"
+ "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
+ [vu] "=&f"(vu)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [dst_frame] "r"(dst_frame), [width] "r"(width)
+ : "memory");
+}
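+
+// Byte layout written above (8 Y + 4 U + 4 V per iteration -> 16 bytes):
+//   dst = { Y0, U0, Y1, V0, Y2, U1, Y3, V1, ... }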
+
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ uint64_t temp[3];
+ uint64_t vu = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
+ "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
+ "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
+ "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
+ "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
+ "punpcklbh %[tu], %[vu], %[ty] \n\t" // g
+ "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
+ "punpckhbh %[tu], %[vu], %[ty] \n\t" // g
+ "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
+ "daddiu %[src_y], %[src_y], 8 \n\t"
+ "daddiu %[src_u], %[src_u], 4 \n\t"
+ "daddiu %[src_v], %[src_v], 4 \n\t"
+ "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
+ [vu] "=&f"(vu)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [dst_frame] "r"(dst_frame), [width] "r"(width)
+ : "memory");
+}
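+
+// Same as I422ToYUY2Row_MMI but with chroma leading each pair:
+//   dst = { U0, Y0, V0, Y1, U1, Y2, V1, Y3, ... }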
+
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, dest;
+ const uint64_t mask0 = 0xff000000ff000000ULL;
+ const uint64_t mask1 = ~mask0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "and %[src], %[src], %[mask0] \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[src], %[dest] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
+ const uint64_t mask = 0xff000000ff000000ULL;
+ const uint64_t shift = 0x18;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "and %[dest0], %[src], %[mask] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "and %[dest1], %[src], %[mask] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
+ "and %[dest0], %[src], %[mask] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
+ "and %[dest1], %[src], %[mask] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
+ [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
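+
+// Scalar equivalent (sketch): dst_a[i] = src_argb[4 * i + 3]; the loop above
+// does eight pixels per iteration via mask, shift and pack.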
+
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, dest0, dest1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x00ffffff00ffffffULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest0], %[mask0], %[src] \n\t"
+ "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "punpckhbh %[dest0], %[mask0], %[src] \n\t"
+ "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
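+
+// Scalar equivalent (sketch): dst[4 * i + 3] = src[i]; an 8-bit plane is
+// written into the alpha channel, eight pixels per iteration.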
+
+void I444ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+ __asm__ volatile (
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ "punpcklbh %[u], %[u], %[zero] \n\t"//u
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[zero] \n\t"//v
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
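+
+// Fixed-point math shared by this family of kernels (a sketch read off the
+// instruction sequence; the YuvConstants offsets used by the loads above are
+// 0x00 ub, 0x20 ug/vg, 0x40 vr, 0x60 bb, 0x80 bg, 0xa0 br, 0xc0 yg):
+//   y1 = ((y * 0x0101) * yg) >> 16;
+//   b = clamp8((y1 + bb - u * ub) >> 6);
+//   g = clamp8((y1 + bg - u * ug - v * vg) >> 6);
+//   r = clamp8((y1 + br - v * vr) >> 6);
+// with saturating 16-bit adds/subtracts and alpha forced to 0xff.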
+
+// Also used for 420
+void I422ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"//v
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
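+
+// I422 differs from I444 only in the chroma fetch: each U/V byte is
+// duplicated (u3|u2|u1|u0 -> u1|u1|u0|u0) so one chroma sample covers two Y
+// samples, which is also why this row function serves I420.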
+
+// 10 bit YUV to ARGB
+void I210ToARGBRow_MMI(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "psllh %[y], %[y], %[six] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklhw %[u], %[u], %[u] \n\t"
+ "psrah %[u], %[u], %[two] \n\t"
+ "punpcklhw %[v], %[v], %[v] \n\t"
+ "psrah %[v], %[v], %[two] \n\t"
+ "pminsh %[u], %[u], %[mask1] \n\t"
+ "pminsh %[v], %[v], %[mask1] \n\t"
+
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask), [two]"f"(0x02),
+ [mask1]"f"(0x00ff00ff00ff00ff)
+ : "memory"
+ );
+}
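+
+// 10-bit handling (sketch): Y is shifted left 6 so the 10-bit sample fills
+// the 16-bit lane before the yg multiply; U/V are shifted right 2 and
+// clamped to 0xff (pminsh with mask1) so the 8-bit math applies unchanged.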
+
+void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v,a;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
+ "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v), [a]"=&f"(a),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [a_ptr]"r"(src_a), [zero]"f"(0x00),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
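+
+// Same as I422ToARGBRow_MMI except the alpha byte is taken from src_a
+// instead of a constant 0xff.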
+
+void I422ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
+ uint64_t mask = 0xff00ff00ff00ff00ULL;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+
+ "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
+ "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
+ "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
+ "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
+ "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
+ "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
+ "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
+ "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
+ "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(mask),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
+}
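+
+// The pextrh/pinsrh tail above repacks four ARGB pixels into 12 bytes of
+// RGB24, dropping each alpha byte:
+//   dst = { B0, G0, R0, B1, G1, R1, B2, G2, R2, B3, G3, R3 }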
+
+void I422ToARGB4444Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "and %[g_vec], %[g_vec], %[mask1] \n\t"
+ "psrlw %[g_vec], %[g_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[g_vec], %[four] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "and %[b_vec], %[b_vec], %[mask1] \n\t"
+ "psrlw %[b_vec], %[b_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[b_vec], %[four] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00),
+ [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
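+
+// Per-pixel packing performed above (sketch, illustrative only):
+//   dst = (b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) | ((a >> 4) << 12);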
+
+void I422ToARGB1555Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlw %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "or %[g_vec], %[g_vec], %[mask3] \n\t"
+
+ "psrlw %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "or %[b_vec], %[b_vec], %[mask3] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [mask3]"f"(0x800000008000),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
+}
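+
+// Per-pixel packing performed above (sketch; mask3 = 0x8000 sets the A bit):
+//   dst = (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) | 0x8000;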
+
+void I422ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
+}
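+
+// Per-pixel packing performed above (sketch):
+//   dst = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);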
+
+void NV12ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
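+
+// ushu (0xA0 -> lanes 0,0,2,2) and vshu (0xf5 -> lanes 1,1,3,3) broadcast
+// the interleaved UV halfwords to u0|u0|u1|u1 and v0|v0|v1|v1; the NV21
+// variant below swaps which selector feeds U and which feeds V.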
+
+void NV21ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
+void NV12ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [lmove1]"f"(0x18),
+ [one]"f"(0x1), [rmove1]"f"(0x8)
+ : "memory"
+ );
+}
+
+void NV21ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
+}
+
+void NV12ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
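+ // RGB565 pack: shift each channel down, mask to 5/6/5 bits, and merge; the
+ // psubb/paddb ops build the shift amounts (5, 9, 11) from the constants.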
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7)
+ : "memory"
+ );
+}
+
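+// YUY2 is packed [Y0 U Y1 V]; one 8-byte load covers 4 pixels.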
+void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
+ "psrlh %[temp], %[y], %[eight] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
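+ // mask1 >> 8 keeps the even (Y) bytes; Y is then replicated into both bytes
+ // of each halfword (y*0x0101) before the pmulhuh multiply.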
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[y], %[y], %[temp] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
+}
+
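+// UYVY is packed [U Y0 V Y1]; the even bytes carry U/V and the odd bytes Y.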
+void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[temp], %[y], %[temp] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
+ "psrlh %[y], %[y], %[eight] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
+}
+
+void I422ToRGBARow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
+ "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
+ "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
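+// Broadcast one 32-bit ARGB value; each iteration stores 4 pixels (16 bytes).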
+void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
+ __asm__ volatile (
+ "punpcklwd %[v32], %[v32], %[v32] \n\t"
+ "1: \n\t"
+ "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
+
+ "daddi %[width], %[width], -0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [v32]"+&f"(v32)
+ : [dst_ptr]"r"(dst_argb), [width]"r"(width)
+ : "memory"
+ );
+}
+// clang-format on
+
+// 10 bit YUV to ARGB
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_msa.cc b/chromium/third_party/libyuv/source/row_msa.cc
index effa68c8b4a..fe6df93a601 100644
--- a/chromium/third_party/libyuv/source/row_msa.cc
+++ b/chromium/third_party/libyuv/source/row_msa.cc
@@ -155,22 +155,21 @@ extern "C" {
}
// Loads current and next row of ARGB input and averages them to calculate U and V
-#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
{ \
v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v16u8 vec8_m, vec9_m; \
v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
v8u16 reg8_m, reg9_m; \
\
- src0_m = (v16u8)__msa_ld_b((void*)s, 0); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 16); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 32); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 48); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 0); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 16); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 32); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 48); \
+ src0_m = (v16u8)__msa_ld_b((void*)s, 0); \
+ src1_m = (v16u8)__msa_ld_b((void*)s, 16); \
+ src2_m = (v16u8)__msa_ld_b((void*)s, 32); \
+ src3_m = (v16u8)__msa_ld_b((void*)s, 48); \
+ src4_m = (v16u8)__msa_ld_b((void*)t, 0); \
+ src5_m = (v16u8)__msa_ld_b((void*)t, 16); \
+ src6_m = (v16u8)__msa_ld_b((void*)t, 32); \
+ src7_m = (v16u8)__msa_ld_b((void*)t, 48); \
vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
@@ -195,81 +194,81 @@ extern "C" {
reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
- src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
- vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
- vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
- vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
- vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
- vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
- vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
- vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
- vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
- reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
- reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
- reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
- reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
- reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
- reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
- reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
- reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
- reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ reg8_m += const_0x0101; \
+ reg9_m += const_0x0101; \
+ reg0_m += const_0x0101; \
+ reg1_m += const_0x0101; \
+ argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \
+ argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \
+ argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \
+ argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \
}
-// Takes ARGB input and calculates U and V.
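+// Computes U and V from widened ARGB sums; 'shift' selects the final scaling.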
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
- shf0, shf1, shf2, shf3, v_out, u_out) \
+ shf0, shf1, shf2, shf3, shift, u_out, v_out) \
{ \
- v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
\
- vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
- vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
- vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
- vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
- vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
- vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
- vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
- vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
- reg0_m = __msa_dotp_u_h(vec0_m, const1); \
- reg1_m = __msa_dotp_u_h(vec1_m, const1); \
- reg2_m = __msa_dotp_u_h(vec4_m, const1); \
- reg3_m = __msa_dotp_u_h(vec5_m, const1); \
- reg0_m += const3; \
- reg1_m += const3; \
- reg2_m += const3; \
- reg3_m += const3; \
- reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
- reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
- reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
- reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
- v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
- u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+ vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const0); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const0); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const0); \
+ reg0_m += const1; \
+ reg1_m += const1; \
+ reg2_m += const1; \
+ reg3_m += const1; \
+ reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \
+ reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \
+ reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \
+ reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \
+ reg0_m = __msa_srl_w(reg0_m, shift); \
+ reg1_m = __msa_srl_w(reg1_m, shift); \
+ reg2_m = __msa_srl_w(reg2_m, shift); \
+ reg3_m = __msa_srl_w(reg3_m, shift); \
+ u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const1); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const1); \
+ reg0_m += (v4u32)const3; \
+ reg1_m += (v4u32)const3; \
+ reg2_m += (v4u32)const3; \
+ reg3_m += (v4u32)const3; \
+ reg0_m -= __msa_dotp_u_w(vec2_m, const0); \
+ reg1_m -= __msa_dotp_u_w(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_w(vec6_m, const2); \
+ reg3_m -= __msa_dotp_u_w(vec7_m, const2); \
+ u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \
+ v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \
}
// Load I444 pixel data
@@ -302,6 +301,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
}
}
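+// Mirror a row of interleaved UV pairs; each 16-bit pair keeps its U,V order.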
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ v8u16 src, dst;
+ v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0};
+ src_uv += (width - 8) << 1;
+ for (x = 0; x < width; x += 8) {
+ src = LD_UH(src_uv);
+ dst = __msa_vshf_h(shuffler, src, src);
+ ST_UH(dst, dst_uv);
+ src_uv -= 16;
+ dst_uv += 16;
+ }
+}
+
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -825,12 +838,13 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v16u8 dst0, dst1;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
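+ // The coefficients above are halved and (sum + 1) >> 1 replaces sum >> 2,
+ // keeping the old scale while rounding the 2x2 average; names are historical.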
for (x = 0; x < width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
@@ -889,12 +903,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 += __msa_hadd_u_h(vec5, vec5);
reg4 += __msa_hadd_u_h(vec0, vec0);
reg5 += __msa_hadd_u_h(vec1, vec1);
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
- reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
- reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
- reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg4 += const_0x0001;
+ reg5 += const_0x0001;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1);
reg6 = reg0 * const_0x70;
reg7 = reg1 * const_0x70;
reg8 = reg2 * const_0x4A;
@@ -1412,17 +1432,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0, dst1;
v8u16 reg0;
- v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
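+ // Luma weights move from 7-bit precision (15,75,38, >>7) to 8-bit precision
+ // (29,150,77, >>8): dotp pairs (B,G) with 0x961D, then adds R*0x4D.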
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
- reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
- reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
- reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ reg0 = __msa_dotp_u_h(vec0, const_0x961D);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8);
vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
@@ -2031,12 +2051,13 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
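+ // Same rounded-halving scheme as ARGBToUVRow_MSA above.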
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2085,10 +2106,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h((v8i16)reg0, 2);
- reg1 = __msa_srai_h((v8i16)reg1, 2);
- reg2 = __msa_srai_h((v8i16)reg2, 2);
- reg3 = __msa_srai_h((v8i16)reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h((v8i16)reg0, 1);
+ reg1 = __msa_srai_h((v8i16)reg1, 1);
+ reg2 = __msa_srai_h((v8i16)reg2, 1);
+ reg3 = __msa_srai_h((v8i16)reg3, 1);
vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
@@ -2136,12 +2161,13 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2190,10 +2216,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h(reg0, 2);
- reg1 = __msa_srai_h(reg1, 2);
- reg2 = __msa_srai_h(reg2, 2);
- reg3 = __msa_srai_h(reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h(reg0, 1);
+ reg1 = __msa_srai_h(reg1, 1);
+ reg2 = __msa_srai_h(reg2, 1);
+ reg3 = __msa_srai_h(reg3, 1);
vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
@@ -2419,16 +2449,16 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
- v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
- v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
+ v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D);
+ v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
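+ // Same 8-bit luma weights as ARGBGrayRow_MSA; the 0x80 bias rounds the >>8.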
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
- ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
dst0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
@@ -2504,61 +2534,123 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 vec0, vec1, vec2, vec3;
- v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
- v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
- v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 dst0, dst1, dst2, dst3;
+ v16u8 zero = {0};
+ v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f);
+ v4u32 const_0x00008080 = (v4u32)__msa_fill_w(0x00008080);
+ v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a);
+ v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a);
+ v4i32 shift = __msa_fill_w(0x00000008);
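+ // Row pairs are summed in 16 bits (bytes zero-extended via ilvr/ilvl), then
+ // horizontally averaged with rounding (aver_u_h) before the dot products.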
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((void*)s, 0);
- src1 = (v16u8)__msa_ld_b((void*)s, 16);
- src2 = (v16u8)__msa_ld_b((void*)s, 32);
- src3 = (v16u8)__msa_ld_b((void*)s, 48);
- src4 = (v16u8)__msa_ld_b((void*)t, 0);
- src5 = (v16u8)__msa_ld_b((void*)t, 16);
- src6 = (v16u8)__msa_ld_b((void*)t, 32);
- src7 = (v16u8)__msa_ld_b((void*)t, 48);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec0 = __msa_aver_u_b(src4, src6);
- vec1 = __msa_aver_u_b(src5, src7);
- src0 = (v16u8)__msa_ld_b((void*)s, 64);
- src1 = (v16u8)__msa_ld_b((void*)s, 80);
- src2 = (v16u8)__msa_ld_b((void*)s, 96);
- src3 = (v16u8)__msa_ld_b((void*)s, 112);
- src4 = (v16u8)__msa_ld_b((void*)t, 64);
- src5 = (v16u8)__msa_ld_b((void*)t, 80);
- src6 = (v16u8)__msa_ld_b((void*)t, 96);
- src7 = (v16u8)__msa_ld_b((void*)t, 112);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec2 = __msa_aver_u_b(src4, src6);
- vec3 = __msa_aver_u_b(src5, src7);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
+ src1 = __msa_ld_b((void*)s, 0);
+ src3 = __msa_ld_b((void*)s, 16);
+ src5 = __msa_ld_b((void*)t, 0);
+ src7 = __msa_ld_b((void*)t, 16);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 32);
+ src3 = __msa_ld_b((void*)s, 48);
+ src5 = __msa_ld_b((void*)t, 32);
+ src7 = __msa_ld_b((void*)t, 48);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst0, dst1);
+
+ src1 = __msa_ld_b((void*)s, 64);
+ src3 = __msa_ld_b((void*)s, 80);
+ src5 = __msa_ld_b((void*)t, 64);
+ src7 = __msa_ld_b((void*)t, 80);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 96);
+ src3 = __msa_ld_b((void*)s, 112);
+ src5 = __msa_ld_b((void*)t, 96);
+ src7 = __msa_ld_b((void*)t, 112);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst2, dst3);
+
+ dst0 = (v8u16)__msa_pckev_b(dst2, dst0);
+ dst1 = (v8u16)__msa_pckev_b(dst3, dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
s += 128;
t += 128;
dst_v += 16;
@@ -2574,28 +2666,30 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
- s += 128;
- t += 128;
- dst_v += 16;
- dst_u += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
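+ // 16 pixels yield 8 U and 8 V bytes, so only the low doubleword is stored.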
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2607,29 +2701,30 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
- v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, src0, src1, src2, src3);
- ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2641,28 +2736,30 @@ void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13};
+ v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2734,13 +2831,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
}
}
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+// TODO: respect YuvConstants
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
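+ // YuvConstants has a different field layout on Arm builds, so the Y
+ // coefficient and bias are read from the matching members.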
v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1;
v4i32 reg0, reg1, reg2, reg3;
- v4i32 vec_yg = __msa_fill_w(0x4A35);
- v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v4i32 vec_yg = __msa_fill_w(yg);
+ v8i16 vec_ygb = __msa_fill_h(ygb);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 max = __msa_ldi_h(0xFF);
v8i16 zero = {0};
@@ -3006,7 +3114,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
int x;
- v16u8 src0, src1, src2, src3, dst0, dst1;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
@@ -3051,12 +3159,12 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
- vec0 += vec8;
- vec1 += vec9;
- vec2 += vec10;
- vec3 += vec11;
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8);
+ dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+ dst0 = (v16u8)__msa_adds_u_b(dst0, dst2);
+ dst1 = (v16u8)__msa_adds_u_b(dst1, dst3);
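+ // Pack to 8 bits first, then add with unsigned saturation so per-channel
+ // sums clamp at 255 instead of wrapping.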
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
@@ -3082,7 +3190,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
v16i8 zero = {0};
- for (x = 0; x < width; x += 8) {
+ for (x = 0; x < width; x += 16) {
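+ // Four 16-byte loads = 16 ARGB pixels per iteration, matching the loop step.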
src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
@@ -3315,10 +3423,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
}
}
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc
index ff87e74c62c..a5aeaabfbd7 100644
--- a/chromium/third_party/libyuv/source/row_neon.cc
+++ b/chromium/third_party/libyuv/source/row_neon.cc
@@ -114,11 +114,11 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV444 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -140,11 +140,11 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -168,10 +168,10 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %5, %5, #8 \n"
- "vld1.8 {d23}, [%3]! \n"
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
+ "subs %5, %5, #8 \n"
+ "vld1.8 {d23}, [%3]! \n"
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -195,10 +195,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -221,9 +221,9 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -253,9 +253,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %4, %4, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -287,10 +287,10 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB1555
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -321,14 +321,14 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // vbic bits to clear
+ "vmov.u8 d4, #0x0f \n" // vbic bits to clear
"1: \n"
READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB4444
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -342,35 +342,38 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
"q12", "q13", "q14", "q15");
}
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV400 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
- [kUVToG] "r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
}
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n"
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -384,11 +387,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -407,11 +410,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -436,9 +439,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -463,9 +466,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -486,9 +489,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %3, %3, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -506,11 +509,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -527,11 +530,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -550,18 +553,18 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store U
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
@@ -571,18 +574,18 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load U
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
@@ -593,13 +596,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
- "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store R
- "vst1.8 {q1}, [%2]! \n" // store G
- "vst1.8 {q2}, [%3]! \n" // store B
- "bgt 1b \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -607,7 +610,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "d0", "d1", "d2" // Clobber List
- );
+ );
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@@ -618,13 +621,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load R
- "vld1.8 {q1}, [%1]! \n" // load G
- "vld1.8 {q2}, [%2]! \n" // load B
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
- "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -632,33 +635,33 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
+ );
}
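
SplitRGBRow_NEON and MergeRGBRow_NEON above are the 3-plane analogue of the UV pair: vld3/vst3 de-interleave and re-interleave 3-byte pixels, 16 per iteration. A scalar sketch of the split direction (name ours); the merge is the same loop with sources and destinations exchanged:

// Scalar sketch of SplitRGBRow: packed RGB triplets to three planes.
static void SplitRGBRow_Scalar(const uint8_t* src_rgb, uint8_t* dst_r,
                               uint8_t* dst_g, uint8_t* dst_b, int width) {
  for (int i = 0; i < width; ++i) {
    dst_r[i] = src_rgb[3 * i + 0];
    dst_g[i] = src_rgb[3 * i + 1];
    dst_b[i] = src_rgb[3 * i + 2];
  }
}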
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -668,11 +671,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
@@ -682,41 +685,62 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #32 \n" // 32 bytes per loop
+
+ "1: \n"
+ "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
+ "subs %2, #32 \n" // 32 pixels per loop.
+ "vrev64.8 q0, q2 \n"
+ "vrev64.8 q1, q1 \n"
+ "vswp d0, d1 \n"
+ "vswp d2, d3 \n"
+ "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
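
The rewritten MirrorRow_NEON doubles the per-iteration work from 16 to 32 bytes: it walks the source backwards with a -32 post-index, byte-reverses each d lane with vrev64.8, completes the 16-byte reversal of each q register with vswp, and stores the pair forward in swapped order. A scalar sketch of the contract it preserves (name ours):

// Scalar sketch of MirrorRow: dst gets the bytes of src in reverse order.
static void MirrorRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}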
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
:
- : "cc", "memory", "r3", "q0");
+ : "cc", "memory", "r12", "q0");
}
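
Note the rename that happens across this hunk and the next: the old MirrorUVRow_NEON, which mirrored and split interleaved UV into separate U and V planes, becomes MirrorSplitUVRow_NEON below, and the MirrorUVRow_NEON added here mirrors the interleaved plane directly, reversing the order of the pairs while keeping each U,V pair intact (vld2/vst2 hold the channels in separate lanes while vrev64.8 reverses them). A scalar sketch of the new contract, with width counting UV pairs (name ours):

// Scalar sketch of the new MirrorUVRow: reverse pair order, keep U before V.
static void MirrorUVRow_Scalar(const uint8_t* src_uv, uint8_t* dst_uv,
                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_uv[2 * (width - 1 - i) + 0];  // U
    dst_uv[2 * i + 1] = src_uv[2 * (width - 1 - i) + 1];  // V
  }
}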
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
asm volatile(
// Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
"1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -725,77 +749,113 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
: "cc", "memory", "r12", "q0");
}
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #32 \n"
"1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
+ "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vrev64.8 d3, d3 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
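
ARGBMirrorRow_NEON takes the same approach: instead of vrev64.32 on four whole pixels, it de-interleaves eight pixels with vld4.8 so a per-channel vrev64.8 reverses pixel order, then vst4.8 re-interleaves. The contract, as a scalar sketch (name ours):

#include <string.h>

// Scalar sketch of ARGBMirrorRow: reverse the order of 4-byte pixels;
// bytes inside each pixel keep their channel positions.
static void ARGBMirrorRow_Scalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    memcpy(dst_argb + 4 * i, src_argb + 4 * (width - 1 - i), 4);
  }
}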
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ src_rgb24 += width * 3 - 24;
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"(-24) // %3
+ : "cc", "memory", "d0", "d1", "d2");
}
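
RGB24MirrorRow_NEON is new and follows the same pattern with vld3/vst3 for 3-byte pixels; the adjustment of the source pointer to the last group of 8 pixels is done in C (src_rgb24 += width * 3 - 24) before entering the loop. Scalar sketch (name ours):

// Scalar sketch of RGB24MirrorRow: reverse the order of 3-byte pixels.
static void RGB24MirrorRow_Scalar(const uint8_t* src_rgb24,
                                  uint8_t* dst_rgb24, int width) {
  for (int i = 0; i < width; ++i) {
    const uint8_t* p = src_rgb24 + 3 * (width - 1 - i);
    dst_rgb24[3 * i + 0] = p[0];
    dst_rgb24[3 * i + 1] = p[1];
    dst_rgb24[3 * i + 2] = p[2];
  }
}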
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "vmov.u8 d0, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
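
RAWToRGBARow_NEON is also new in this roll. It is RAWToARGBRow_NEON with the constant alpha moved to d0, so the vst4 writes A,B,G,R byte order, libyuv's RGBA layout. A scalar sketch, assuming libyuv's RAW byte order of R,G,B in memory (name ours):

// Scalar sketch of RAWToRGBARow: RAW bytes R,G,B -> RGBA bytes A,B,G,R,
// with alpha forced to 255 as by "vmov.u8 d0, #255" above.
static void RAWToRGBARow_Scalar(const uint8_t* src_raw, uint8_t* dst_rgba,
                                int width) {
  for (int i = 0; i < width; ++i) {
    dst_rgba[4 * i + 0] = 255;                 // A
    dst_rgba[4 * i + 1] = src_raw[3 * i + 2];  // B
    dst_rgba[4 * i + 2] = src_raw[3 * i + 1];  // G
    dst_rgba[4 * i + 3] = src_raw[3 * i + 0];  // R
  }
}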
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3" // Clobber List
- );
+ );
}
#define RGB565TOARGB \
@@ -814,19 +874,19 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
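
The RGB565TOARGB macro used above (its body falls outside this hunk) expands the 5/6/5-bit channels to 8 bits by replicating the top bits into the low bits, so both extremes map exactly. A scalar sketch of that unpack, with libyuv's RGB565 bit layout of B in bits 0-4, G in 5-10, R in 11-15:

#include <stdint.h>

// Scalar sketch of the 565 unpack: replicate top bits, so 0x1f -> 0xff.
static void RGB565Unpack(uint16_t px, uint8_t* b, uint8_t* g, uint8_t* r) {
  unsigned b5 = px & 0x1f;
  unsigned g6 = (px >> 5) & 0x3f;
  unsigned r5 = (px >> 11) & 0x1f;
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}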
#define ARGB1555TOARGB \
@@ -860,19 +920,19 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
#define ARGB4444TOARGB \
@@ -889,19 +949,19 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
+ );
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@@ -909,63 +969,63 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
+ );
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@@ -974,18 +1034,18 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
+ );
}
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@@ -994,18 +1054,18 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
+ );
}
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@@ -1014,16 +1074,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
@@ -1032,7 +1092,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7" // Clobber List
- );
+ );
}
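
YUY2ToUVRow_NEON reads two rows of YUY2 (bytes Y0,U,Y1,V per pixel pair), and vrhadd.u8 takes the rounded vertical average of the chroma before it is stored; UYVYToUVRow_NEON below is the same with the byte offsets shifted down by one. Scalar sketch (name ours; width in pixels, assumed even):

// Scalar sketch of YUY2ToUVRow: average U and V with the next row.
static void YUY2ToUVRow_Scalar(const uint8_t* src_yuy2, int stride_yuy2,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_yuy2 + stride_yuy2;
  for (int i = 0; i < width; i += 2) {  // one U,V per two pixels
    dst_u[i / 2] = (uint8_t)((src_yuy2[2 * i + 1] + next[2 * i + 1] + 1) >> 1);
    dst_v[i / 2] = (uint8_t)((src_yuy2[2 * i + 3] + next[2 * i + 3] + 1) >> 1);
  }
}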
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@@ -1041,16 +1101,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
@@ -1059,7 +1119,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7" // Clobber List
- );
+ );
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@@ -1068,20 +1128,20 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
+ );
}
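
ARGBShuffleRow_NEON implements the byte-order conversions named in the comment above with a single 16-byte table: q2 holds the shuffler, and the vtbl pair rewrites each group of four pixels as out[j] = in[shuffler[j]]. Scalar sketch (name ours; width assumed a multiple of 4, as the NEON loop effectively requires of its callers):

// Scalar sketch of ARGBShuffleRow; e.g. the table
// {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} converts ARGB <-> BGRA.
static void ARGBShuffleRow_Scalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; i += 4) {  // 4 pixels = 16 bytes per group
    for (int j = 0; j < 16; ++j) {
      dst_argb[4 * i + j] = src_argb[4 * i + shuffler[j]];
    }
  }
}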
void I422ToYUY2Row_NEON(const uint8_t* src_y,
@@ -1091,12 +1151,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1113,12 +1173,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1133,11 +1193,11 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1150,16 +1210,16 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "vdup.32 d2, %2 \n" // dither4
+ "vdup.32 d2, %2 \n" // dither4
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n" // add for dither
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n" // add for dither
ARGBTORGB565
- "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1172,11 +1232,11 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1188,14 +1248,14 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "vmov.u8 d4, #0x0f \n" // bits to clear with
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1205,20 +1265,20 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1231,33 +1291,54 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
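
The hunks around here carry the substantive math change of this roll. ARGBToYRow_NEON (above) moves from 7-bit luma coefficients 13/65/33 with vqrshrun.s16 #7 to 8-bit 25/129/66 with vqrshrn.u16 #8; the carried-over comments still quote the old fractions, while the new constants work out to 25/256 ≈ 0.098, 129/256 ≈ 0.504 and 66/256 ≈ 0.258. ARGBToYJRow_NEON just below gets the same treatment (15/75/38 become 29/150/77), RGBAToYJRow_NEON is added reading one d register over so the leading alpha byte is skipped, and the RGB565/ARGB1555/ARGB4444 Y kernels near the end of this section repeat the 25/129/66 change. The *ToUVRow_NEON hunks further down are re-indentation plus, where it applied, one comment fix ("32 processed per loop" was really 16), except ARGB4444ToUVRow_NEON, which drops its hand-inlined multiply tail in favor of the shared RGBTOUV macro. Scalar checks of the three formulas (helper names ours; the U/V form is inferred from the halved constants the UV kernels load):

#include <stdint.h>

// Studio-range Y (ARGBToYRow): coefficients sum to 220, so white maps to
// ((220 * 255 + 128) >> 8) + 16 = 219 + 16 = 235, the BT.601 maximum.
static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  int y = ((25 * b + 129 * g + 66 * r + 128) >> 8) + 16;
  return (uint8_t)(y > 255 ? 255 : y);  // vqadd.u8 saturates the same way
}

// Full-range Y (ARGBToYJRow/RGBAToYJRow): 29 + 150 + 77 = 256, so white
// stays 255 and no offset is added.
static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}

// U/V per 2x2-averaged sample; 0x8080 supplies the +128 bias plus the
// rounding half consumed by the >> 8, mirroring the vadd/vqshrn pair.
static void RGBToUV(uint8_t r, uint8_t g, uint8_t b, uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}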
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d1, d24 \n" // B
+ "vmlal.u8 q2, d2, d25 \n" // G
+ "vmlal.u8 q2, d3, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1271,32 +1352,32 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
// coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
-
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1328,34 +1409,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1374,34 +1455,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1419,34 +1500,34 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q3, q2, q1)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2
@@ -1464,34 +1545,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
@@ -1509,34 +1590,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
@@ -1554,34 +1635,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2
@@ -1599,34 +1680,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1
"+r"(dst_u), // %2
@@ -1645,55 +1726,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.

- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_stride_rgb565), // %1
"+r"(dst_u), // %2
@@ -1711,55 +1792,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.

- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_stride_argb1555), // %1
"+r"(dst_u), // %2
@@ -1777,55 +1858,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.

- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q0, q4, #1 \n" // 2x average
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_stride_argb4444), // %1
"+r"(dst_u), // %2
@@ -1838,21 +1910,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1864,21 +1936,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1890,21 +1962,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1914,20 +1986,20 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1937,20 +2009,20 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1960,20 +2032,20 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1983,20 +2055,20 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2006,20 +2078,20 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2027,6 +2099,48 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
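
The ...ToYRow hunks above all make the same change: the halved BT.601 luma coefficients (13/65/33, narrowed with a #7 shift) become the full 8-bit set (25/129/66, narrowed with a rounding #8 shift), and vqrshrun.s16 becomes vqrshrn.u16 since the u8 x u8 accumulator is unsigned. In scalar form the updated loops compute the following (illustrative helper, not patch code):

    #include <stdint.h>

    /* Studio-swing luma: round, narrow, then add the +16 offset
       with saturation (the vqadd.u8 d0, d27 step). */
    static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
      int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
      return (uint8_t)(y > 255 ? 255 : y);
    }
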
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
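
RGB24ToYJRow_NEON and RAWToYJRow_NEON are new in this roll. The J variants compute full-range (JPEG) luma, so the coefficients sum to 256 and no +16 offset is added. A scalar sketch with an illustrative name:

    #include <stdint.h>

    static uint8_t rgb_to_yj(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
    }
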
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2035,46 +2149,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2092,51 +2206,51 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %3, #8 \n"
- "blt 89f \n"
+ "subs %3, #8 \n"
+ "blt 89f \n"
// Blend 8 pixels.
"8: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
"89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
// Blend 1 pixels.
"1: \n"
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2153,16 +2267,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2178,32 +2292,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
"vqdmulh.s16 q0, q0, q8 \n" // b * scale
"vqdmulh.s16 q1, q1, q8 \n" // g
"vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2220,28 +2334,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
"vqrdmulh.s16 q11, q11, d0[1] \n" // g
"vqrdmulh.s16 q12, q12, d0[2] \n" // r
"vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2251,23 +2365,23 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2281,32 +2395,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
@@ -2322,51 +2436,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
-
- "1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2383,19 +2497,19 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2412,13 +2526,13 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2435,13 +2549,13 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2460,17 +2574,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2487,12 +2601,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2511,15 +2625,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2539,23 +2653,23 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%5 \n" // top
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2564,7 +2678,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
: "r"(2), // %5
"r"(6) // %6
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
// SobelY as a matrix is
@@ -2577,23 +2691,23 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%4 \n" // left
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%0],%5 \n" // right
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2601,7 +2715,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
: "r"(1), // %4
"r"(6) // %5
: "cc", "memory", "q0", "q1" // Clobber List
- );
+ );
}
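
Both Sobel kernels evaluate the same 1-2-1 tap and take an absolute difference. For SobelX the weighting runs down three rows and the subtracted samples sit two columns over (hence the %5 = 2 / %6 = 6 post-increments re-walking each row); for SobelY it runs along the row and the difference is taken between the two rows. A scalar sketch (illustrative name):

    #include <stdint.h>
    #include <stdlib.h>

    static uint8_t sobel_tap(int a, int b, int c, int a2, int b2, int c2) {
      int s = abs((a + 2 * b + c) - (a2 + 2 * b2 + c2));
      return (uint8_t)(s > 255 ? 255 : s);
    }
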
// %y passes a float as a scalar vector for vector * scalar multiply.
@@ -2615,18 +2729,18 @@ void HalfFloat1Row_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2641,18 +2755,18 @@ void HalfFloatRow_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2667,17 +2781,17 @@ void ByteToFloatRow_NEON(const uint8_t* src,
asm volatile(
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 bytes
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u8 q1, d2 \n" // 8 shorts
- "vmovl.u16 q2, d2 \n" // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // scale
- "vmul.f32 q3, q3, %y3 \n"
- "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2685,6 +2799,238 @@ void ByteToFloatRow_NEON(const uint8_t* src,
: "cc", "memory", "q1", "q2", "q3");
}
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// filter 5 adjacent source samples with 1, 4, 6, 4, 1 coefficients to produce 1 sample.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
+ "vld1.32 {q2}, [%0] \n"
+ "vadd.u32 q0, q0, q1 \n" // * 1
+ "vadd.u32 q1, q1, q2 \n" // * 1
+ "vld1.32 {q2, q3}, [%2]! \n"
+ "vmla.u32 q0, q2, q11 \n" // * 6
+ "vmla.u32 q1, q3, q11 \n" // * 6
+ "vld1.32 {q2, q3}, [%1]! \n"
+ "vld1.32 {q8, q9}, [%3]! \n"
+ "vadd.u32 q2, q2, q8 \n" // add rows for * 4
+ "vadd.u32 q3, q3, q9 \n"
+ "vmla.u32 q0, q2, q10 \n" // * 4
+ "vmla.u32 q1, q3, q10 \n" // * 4
+ "subs %5, %5, #8 \n" // 8 processed per loop
+ "vqshrn.u32 d0, q0, #8 \n" // round and pack
+ "vqshrn.u32 d1, q1, #8 \n"
+ "vst1.u16 {q0}, [%4]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
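
GaussCol_NEON and GaussRow_NEON are the two passes of a separable 5-tap binomial (Gaussian) filter. The column pass widens to 32 bits; the row pass narrows back with a saturating >> 8, so the two passes together divide by 16 * 16 = 256. One tap in scalar form (illustrative name):

    #include <stdint.h>

    static uint32_t gauss5(uint32_t s0, uint32_t s1, uint32_t s2,
                           uint32_t s3, uint32_t s4) {
      return s0 + 4 * s1 + 6 * s2 + 4 * s3 + s4;  /* weights sum to 16 */
    }
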
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d0, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d1, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
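
AYUVToUVRow_NEON and AYUVToVURow_NEON share the same 2x2 subsample and differ only in which averaged plane is written to d0 and d1 (UV versus VU output order). The vpaddl/vpadal pair sums four samples and vqrshrun #2 rounds; per output byte that is simply (illustrative name):

    #include <stdint.h>

    static uint8_t avg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
      return (uint8_t)((a + b + c + d + 2) >> 2);
    }
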
+// Copy row of AYUV Y's into Y.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n"
+ "vqrshrn.u16 d1, q1, #2 \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
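
HalfMergeUVRow_NEON, also new in this roll, 2x2 box-filters separate U and V planes and writes them interleaved (NV12 UV layout). Per output byte and plane it is the same rounded four-sample average shown in the avg4 sketch above, applied to two adjacent bytes on two adjacent rows of each source plane.
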
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libyuv/source/row_neon64.cc
index 24b4520babc..d5258a3aef3 100644
--- a/chromium/third_party/libyuv/source/row_neon64.cc
+++ b/chromium/third_party/libyuv/source/row_neon64.cc
@@ -68,13 +68,13 @@ extern "C" {
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
-#define YUVTORGB_SETUP \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+#define YUVTORGB_SETUP \
+ "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+ "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+
+// clang-format off
#define YUVTORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
@@ -89,29 +89,23 @@ extern "C" {
"mov v2.d[0], v1.d[1] \n" /* Extract V */ \
"uxtl v2.8h, v2.8b \n" \
"uxtl v1.8h, v1.8b \n" /* Extract U */ \
- "mul v3.8h, v1.8h, v27.8h \n" \
- "mul v5.8h, v1.8h, v29.8h \n" \
- "mul v6.8h, v2.8h, v30.8h \n" \
- "mul v7.8h, v2.8h, v28.8h \n" \
+ "mul v3.8h, v27.8h, v1.8h \n" \
+ "mul v5.8h, v29.8h, v1.8h \n" \
+ "mul v6.8h, v30.8h, v2.8h \n" \
+ "mul v7.8h, v28.8h, v2.8h \n" \
"sqadd v6.8h, v6.8h, v5.8h \n" \
- "sqadd " #vB \
- ".8h, v24.8h, v0.8h \n" /* B */ \
- "sqadd " #vG \
- ".8h, v25.8h, v0.8h \n" /* G */ \
- "sqadd " #vR \
- ".8h, v26.8h, v0.8h \n" /* R */ \
- "sqadd " #vB ".8h, " #vB \
- ".8h, v3.8h \n" /* B */ \
- "sqsub " #vG ".8h, " #vG \
- ".8h, v6.8h \n" /* G */ \
- "sqadd " #vR ".8h, " #vR \
- ".8h, v7.8h \n" /* R */ \
- "sqshrun " #vB ".8b, " #vB \
- ".8h, #6 \n" /* B */ \
- "sqshrun " #vG ".8b, " #vG \
- ".8h, #6 \n" /* G */ \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
+// clang-format on
+
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -120,13 +114,16 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
READYUV444
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -149,13 +146,17 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
+ "movi v23.8b, #255 \n" /* A */
+
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -179,13 +180,17 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v23.8b}, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -209,13 +214,16 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
- "1: \n"
+ "movi v20.8b, #255 \n" /* A */
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v23, v22, v21)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -238,12 +246,15 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -265,6 +276,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
+// clang-format off
+
void I422ToRGB565Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -272,13 +285,17 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTORGB565
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -308,14 +325,18 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTOARGB1555
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -328,6 +349,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
}
+// clang-format on
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
@@ -347,15 +369,18 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "movi v23.8b, #255 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB4444
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -370,23 +395,27 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
);
}
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUV400
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
@@ -394,14 +423,15 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v23.8b, #255 \n"
+ "movi v23.8b, #255 \n"
"1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -416,13 +446,15 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV12
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -443,13 +475,15 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV21
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -470,12 +504,14 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV12
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -496,12 +532,14 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV21
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -521,13 +559,13 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "1: \n" READNV12 YUVTORGB(
- v22, v21,
- v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP "1: \n" READNV12
+ "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
+ v22, v21, v20) ARGBTORGB565
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -546,13 +584,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUY2
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -571,13 +610,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READUYVY
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -597,18 +637,19 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store U
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
@@ -618,18 +659,20 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
@@ -640,12 +683,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store R
- "st1 {v1.16b}, [%2], #16 \n" // store G
- "st1 {v2.16b}, [%3], #16 \n" // store B
- "b.gt 1b \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -653,7 +697,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@@ -664,12 +708,16 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
- "ld1 {v2.16b}, [%2], #16 \n" // load B
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -677,33 +725,34 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
// Copy multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "ldp q0, q1, [%0], #32 \n"
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- "stp q0, q1, [%1], #32 \n"
- "b.gt 1b \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -712,130 +761,219 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
: "cc", "memory", "v0");
}
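Both fill routines amount to a memset at different granularities: the NEON code stores 16 bytes (SetRow) or 4 ARGB pixels (ARGBSetRow) per iteration. A scalar sketch in the style of the library's C fallbacks (names illustrative):

    #include <stdint.h>

    static void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = v8;  // NEON: dup v0.16b + st1, 16 bytes at a time
      }
    }

    static void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
      uint32_t* dst = (uint32_t*)dst_argb;
      for (int i = 0; i < width; ++i) {
        dst[i] = v32;  // NEON: dup v0.4s + st1, 4 pixels at a time
      }
    }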
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirror) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
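The rewritten MirrorRow drops rev64 plus split stores in favour of a tbl shuffle through kShuffleMirror, reading the row backwards 32 bytes per iteration. What it computes, as a scalar sketch:

    #include <stdint.h>

    static void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
      src += width - 1;  // start at the last byte of the source row
      for (int i = 0; i < width; ++i) {
        dst[i] = src[-i];
      }
    }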
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1");
-}
-
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
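ARGBMirrorRow reverses whole 4-byte pixels rather than single bytes, which is why kShuffleMirrorARGB keeps each group of four indices in order. Scalar sketch:

    #include <stdint.h>

    static void ARGBMirrorRow_C(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width) {
      const uint32_t* src = (const uint32_t*)src_argb + width - 1;
      uint32_t* dst = (uint32_t*)dst_argb;
      for (int i = 0; i < width; ++i) {
        dst[i] = src[-i];  // copy ARGB pixels in reverse order
      }
    }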
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "ld1 {v3.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #48 \n"
+
+ "1: \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v2.16b}, v3.16b \n"
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-48), // %3
+ "r"(&kShuffleMirror) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v4.8b, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v5.8b, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
+ );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "movi v0.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "orr v1.8b, v5.8b, v5.8b \n" // move r
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
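RAW stores R,G,B in memory while RGB24 stores B,G,R, so the RAW conversions above are pure byte reorders (plus an alpha fill for the ARGB/RGBA targets). Scalar sketch of the RGB24 case:

    #include <stdint.h>

    static void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24,
                                int width) {
      for (int i = 0; i < width; ++i) {
        dst_rgb24[0] = src_raw[2];  // B
        dst_rgb24[1] = src_raw[1];  // G
        dst_rgb24[2] = src_raw[0];  // R
        src_raw += 3;
        dst_rgb24 += 3;
      }
    }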
#define RGB565TOARGB \
@@ -855,19 +993,20 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
- );
+ );
}
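RGB565TOARGB widens the 5- and 6-bit fields back to 8 bits; the usual trick, which the macro's shift-and-or sequence amounts to, is to replicate the top bits of each field into the vacated low bits so full white stays 0xff. A per-pixel scalar sketch, assuming that bit-replication behaviour:

    #include <stdint.h>

    static void RGB565ToARGBPixel(uint16_t rgb565, uint8_t argb[4]) {
      uint8_t r5 = (uint8_t)((rgb565 >> 11) & 0x1f);
      uint8_t g6 = (uint8_t)((rgb565 >> 5) & 0x3f);
      uint8_t b5 = (uint8_t)(rgb565 & 0x1f);
      argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));  // B
      argb[1] = (uint8_t)((g6 << 2) | (g6 >> 4));  // G
      argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));  // R
      argb[3] = 0xff;                              // A
    }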
#define ARGB1555TOARGB \
@@ -911,22 +1050,24 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
+// clobbers v3
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
@@ -944,18 +1085,18 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@@ -963,64 +1104,68 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+ // RGB24
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
+ );
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@@ -1029,18 +1174,19 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@@ -1049,18 +1195,19 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
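The packed-4:2:2 routines above differ only in which interleaved lanes carry Y and chroma: YUY2 is Y0,U,Y1,V per two pixels and UYVY is U,Y0,V,Y1. Scalar sketch of the YUY2 UV extraction:

    #include <stdint.h>

    static void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u,
                                 uint8_t* dst_v, int width) {
      for (int i = 0; i < width; i += 2) {  // two pixels share one U and V
        *dst_u++ = src_yuy2[1];
        *dst_v++ = src_yuy2[3];
        src_yuy2 += 4;
      }
    }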
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@@ -1071,14 +1218,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
@@ -1087,7 +1235,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
- );
+ );
}
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@@ -1098,14 +1246,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
@@ -1114,7 +1263,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
- );
+ );
}
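In the two-row variants the urhadd instructions average chroma from vertically adjacent lines with rounding; the scalar equivalent per lane is:

    #include <stdint.h>

    static inline uint8_t RoundHalvingAdd(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);  // what urhadd does per byte lane
    }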
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@@ -1123,19 +1272,20 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
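As a usage note, the 16-byte shuffler selects input byte shuffler[i] for output byte i, so a table that exchanges bytes 0 and 2 of every 4-byte pixel converts ABGR to ARGB. The values below match libyuv's own kShuffleMaskABGRToARGB table:

    #include <stdint.h>

    static const uint8_t kSwapRB[16] = {2,  1, 0, 3,  6,  5,  4,  7,
                                        10, 9, 8, 11, 14, 13, 12, 15};
    // ARGBShuffleRow_NEON(src_abgr, dst_argb, kSwapRB, width);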
void I422ToYUY2Row_NEON(const uint8_t* src_y,
@@ -1145,13 +1295,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "orr v2.8b, v1.8b, v1.8b \n"
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1168,13 +1319,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "orr v3.8b, v2.8b, v2.8b \n"
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1189,11 +1341,13 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1206,15 +1360,17 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "dup v1.4s, %w2 \n" // dither4
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "dup v1.4s, %w2 \n" // dither4
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1227,12 +1383,13 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB1555.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1244,15 +1401,16 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "movi v4.16b, #0x0f \n" // bits to clear with
+ "movi v4.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB4444.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1262,20 +1420,21 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1288,33 +1447,56 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #15 \n" // B * 0.11400 coefficient
- "movi v5.8b, #75 \n" // G * 0.58700 coefficient
- "movi v6.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
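The luma coefficient changes above (13/65/33 with a 7-bit shift becoming 25/129/66 with an 8-bit shift for ARGBToYRow, and 15/75/38 becoming 29/150/77 for the J variants) keep the same BT.601 weights at one extra bit of precision, with uqrshrn supplying the rounding. The net per-pixel arithmetic, as a scalar sketch:

    #include <stdint.h>

    // BT.601 limited-range luma: Y = ((66R + 129G + 25B + 128) >> 8) + 16.
    static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);
    }

    // Full-range "J" luma used by the YJ variants: no +16 offset.
    static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
    }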
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v1.8b, v4.8b \n" // B
+ "umlal v0.8h, v2.8b, v5.8b \n" // G
+ "umlal v0.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1328,33 +1510,33 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movi v24.8b, #112 \n" // UB / VR 0.875
+ "movi v24.8b, #112 \n" // UB / VR 0.875
// coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- // pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
-
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
-
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1398,26 +1580,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
-
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1429,7 +1613,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
);
}
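RGBTOUV runs on 16-bit values kept at twice the pixel scale (the urshr #1 above halves a 2x2 sum, leaving 2x the average), with coefficients pre-halved to compensate. The net BT.601 chroma math per averaged pixel, as a scalar sketch:

    #include <stdint.h>

    // 0x8080 biases the signed result into the unsigned 8-bit range.
    static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }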
-// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@@ -1437,31 +1620,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
- "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
- "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
- "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
- "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
- "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1481,25 +1666,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v3.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
@@ -1519,25 +1706,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v3.8h, #1 \n" // 2x average
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
@@ -1557,25 +1746,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
@@ -1595,25 +1786,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
@@ -1633,25 +1826,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
const uint8_t* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v2.8h, v2.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
@@ -1663,7 +1858,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -1671,67 +1866,54 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int width) {
const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile(
- "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
- // 2
- "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
- "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
- "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
- "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
- "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
+ RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v17.D[0] \n"
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v16.8h, v4.8h, v22.8h \n" // B
- "mls v16.8h, v5.8h, v23.8h \n" // G
- "mls v16.8h, v6.8h, v24.8h \n" // R
- "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
- "mul v17.8h, v6.8h, v22.8h \n" // R
- "mls v17.8h, v5.8h, v26.8h \n" // G
- "mls v17.8h, v4.8h, v25.8h \n" // B
- "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27");
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
}
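The subsampling pattern in this rewrite is the same one the ARGB variants use: uaddlp sums horizontal pairs, uadalp accumulates the second row, and urshr #1 halves with rounding, leaving a value at twice the 2x2 average for RGBTOUV's halved coefficients. Scalar sketch:

    #include <stdint.h>

    // Sum a 2x2 block and halve with rounding; result is 2x the average.
    static inline uint16_t Sum2x2Half(uint8_t tl, uint8_t tr,
                                      uint8_t bl, uint8_t br) {
      return (uint16_t)((tl + tr + bl + br + 1) >> 1);
    }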
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
@@ -1744,50 +1926,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
@@ -1807,52 +1982,45 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
int width) {
const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile(
- RGBTOUV_SETUP_REG
+ RGBTOUV_SETUP_REG // sets v20-v25
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
@@ -1863,26 +2031,27 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28"
- );
+ );
}
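// The RGB565/ARGB1555/ARGB4444 ToY kernels below move from 13/65/33 with
// a #7 shift to 25/129/66 with a #8 shift; both encode BT.601
// studio-range luma. A scalar sketch of what the updated code computes
// per pixel (saturation omitted for brevity):
static inline uint8_t RGBToYSketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)(((25 * b + 129 * g + 66 * r + 128) >> 8) + 16);
}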
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1895,21 +2064,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1921,21 +2091,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1945,20 +2116,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1968,20 +2140,21 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1991,20 +2164,21 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2014,20 +2188,21 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2037,20 +2212,21 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2058,6 +2234,50 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // B
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "movi v6.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v4.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // B
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
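// The new *ToYJRow kernels above use full-range (JPEG) BT.601 luma: the
// same 29/150/77 weights as ARGBGrayRow, rounded by uqrshrn #8, with no
// +16 offset. Scalar sketch:
static inline uint8_t RGBToYJSketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}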
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2068,44 +2288,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2124,56 +2349,60 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
- // pixels
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- // pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- // pixels
- "b.ge 8b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
"89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
"99: \n"
@@ -2193,17 +2422,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2219,32 +2448,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2261,28 +2491,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2292,23 +2523,24 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v24.8b, #15 \n" // B * 0.11400 coefficient
- "movi v25.8b, #75 \n" // G * 0.58700 coefficient
- "movi v26.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2323,32 +2555,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
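// A scalar sketch of the sepia transform above; the three coefficient
// triples are Q7 fixed point and feed the B, G and R outputs in turn.
// uqshrn #7 saturates to 255; that clamp is omitted here for brevity:
static inline void SepiaSketch(uint8_t b, uint8_t g, uint8_t r,
                               uint8_t* sb, uint8_t* sg, uint8_t* sr) {
  *sb = (uint8_t)((17 * b + 68 * g + 35 * r) >> 7);
  *sg = (uint8_t)((22 * b + 88 * g + 45 * r) >> 7);
  *sr = (uint8_t)((24 * b + 98 * g + 50 * r) >> 7);
}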
@@ -2364,51 +2597,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
-
- "1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2426,19 +2660,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2455,15 +2691,17 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2480,15 +2718,17 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2507,17 +2747,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2534,12 +2776,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2558,15 +2802,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2586,23 +2832,26 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%5 \n" // top
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2611,7 +2860,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
: "r"(2LL), // %5
"r"(6LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
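// A scalar sketch of SobelXRow above: src_y0/src_y1/src_y2 are the rows
// above, at and below the output, sampled at columns i and i + 2 with
// the middle row counted twice (the 1-2-1 weights); uqxtn saturates:
static inline uint8_t SobelXSketch(const uint8_t* y0, const uint8_t* y1,
                                   const uint8_t* y2, int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;
  return (uint8_t)(s > 255 ? 255 : s);
}
// SobelYRow below differences two rows at columns i, i + 1 (doubled) and
// i + 2 in the same way.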
// SobelY as a matrix is
@@ -2624,23 +2873,25 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%4 \n" // left
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n" // right
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2648,7 +2899,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
: "r"(1LL), // %4
"r"(6LL) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
// Caveat - rounds float to half float whereas scaling version truncates.
@@ -2658,16 +2909,17 @@ void HalfFloat1Row_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2681,18 +2933,19 @@ void HalfFloatRow_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2706,17 +2959,18 @@ void ByteToFloatRow_NEON(const uint8_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v1.8h, v1.8b \n" // 8 shorts
- "uxtl v2.4s, v1.4h \n" // 8 ints
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
- "b.gt 1b \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2730,20 +2984,21 @@ float ScaleMaxSamples_NEON(const float* src,
int width) {
float fmax;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n"
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
- "fmax v5.4s, v5.4s, v1.4s \n" // max
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "fmax v5.4s, v5.4s, v6.4s \n" // max
- "fmaxv %s3, v5.4s \n" // signed max acculator
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2759,21 +3014,22 @@ float ScaleSumSamples_NEON(const float* src,
int width) {
float fsum;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n" // max
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n"
- "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "faddp v5.4s, v5.4s, v6.4s \n"
- "faddp v5.4s, v5.4s, v5.4s \n"
- "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2786,12 +3042,13 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2808,26 +3065,31 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "movi v6.8h, #4 \n" // constant 4
- "movi v7.8h, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
- "ld1 {v2.8h}, [%4], #16 \n"
- "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
- "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
- "ld1 {v2.8h}, [%1], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "ld1 {v2.8h}, [%2], #16 \n"
- "umlal v0.4s, v2.4h, v7.4h \n" // * 6
- "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
- "ld1 {v2.8h}, [%3], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2845,27 +3107,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "movi v6.4s, #4 \n" // constant 4
- "movi v7.4s, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
- "add v0.4s, v0.4s, v1.4s \n" // * 1
- "add v1.4s, v1.4s, v2.4s \n" // * 1
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
- "mla v0.4s, v2.4s, v7.4s \n" // * 6
- "mla v1.4s, v3.4s, v7.4s \n" // * 6
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
- "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
- "add v3.4s, v3.4s, v5.4s \n"
- "mla v0.4s, v2.4s, v6.4s \n" // * 4
- "mla v1.4s, v3.4s, v6.4s \n" // * 4
- "subs %w5, %w5, #8 \n" // 8 processed per loop
- "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
- "uqrshrn2 v0.8h, v1.4s, #8 \n"
- "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
- "b.gt 1b \n"
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2876,6 +3139,246 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
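The Gauss kernels in this file all apply the binomial 1, 4, 6, 4, 1 filter, whose taps sum to 16 (256 for the separable column-then-row pair). As a reading aid, a scalar sketch of what GaussCol_NEON computes per output sample; the function name is illustrative, not part of the patch:

#include <stdint.h>

// Vertical 5-tap pass: weighted sum of 5 source rows, no normalization
// (the matching row pass divides by 256).
static void GaussColRef(const uint16_t* s0, const uint16_t* s1,
                        const uint16_t* s2, const uint16_t* s3,
                        const uint16_t* s4, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint32_t)s0[i] + 4u * s1[i] + 6u * s2[i] + 4u * s3[i] +
             (uint32_t)s4[i];
  }
}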
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ asm volatile(
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
+ "fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ : "r"(&kGaussCoefficients) // %7
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// filter one row horizontally with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+ asm volatile(
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
+ // rows
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
+ "fadd v1.4s, v1.4s, v2.4s \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
+ "fadd v2.4s, v2.4s, v4.4s \n"
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kGaussCoefficients), // %3
+ "r"(8LL), // %4
+ "r"(-4LL), // %5
+ "r"(20LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
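The post-increments bound as %4 (8), %5 (-4) and %6 (20) walk the source pointer back and forth in byte units so the overlapping taps can be reloaded without extra registers; the net advance per iteration is 8 - 4 + 8 + 20 = 32 bytes, i.e. 8 floats for 8 outputs. A scalar sketch of the computation (hypothetical name; reads width + 4 samples):

// Horizontal 5-tap pass with the 1/256 normalization folded in.
static void GaussRowF32Ref(const float* src, float* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (src[i] + 4.0f * src[i + 1] + 6.0f * src[i + 2] +
              4.0f * src[i + 3] + src[i + 4]) * (1.0f / 256.0f);
  }
}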
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
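Each ld2 pulls 8 VU pairs for 16 Y samples, and zip1 duplicates every chroma byte so one pair serves two pixels; st3 then writes bytes in V, U, Y order. A scalar equivalent (name illustrative, not part of the patch):

static void NV21ToYUV24Ref(const uint8_t* src_y, const uint8_t* src_vu,
                           uint8_t* dst_yuv24, int width) {
  for (int i = 0; i < width; ++i) {
    dst_yuv24[3 * i + 0] = src_vu[(i & ~1) + 0];  // V, shared by 2 pixels
    dst_yuv24[3 * i + 1] = src_vu[(i & ~1) + 1];  // U
    dst_yuv24[3 * i + 2] = src_y[i];              // Y
  }
}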
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
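AYUVToUVRow_NEON and AYUVToVURow_NEON differ only in which register pair is stored first; both box-average a 2x2 block of chroma. A scalar sketch of the VU variant, assuming libyuv's AYUV byte order (V, U, Y, A in memory) and an even width; the helper name is hypothetical:

static void AYUVToVURef(const uint8_t* src_ayuv, int stride,
                        uint8_t* dst_vu, int width) {
  for (int i = 0; i < width; i += 2) {
    const uint8_t* p = src_ayuv + i * 4;  // two pixels in this row
    const uint8_t* q = p + stride;        // two pixels in the next row
    dst_vu[0] = (uint8_t)((p[0] + p[4] + q[0] + q[4] + 2) >> 2);  // V
    dst_vu[1] = (uint8_t)((p[1] + p[5] + q[1] + q[5] + 2) >> 2);  // U
    dst_vu += 2;
  }
}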
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
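The tbl lookup through kShuffleSwapUV just swaps adjacent bytes; the scalar equivalent is a two-byte swap per pixel (name illustrative):

static void SwapUVRef(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int i = 0; i < width; ++i) {
    dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V written first
    dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // then U
  }
}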
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n"
+ "ld1 {v3.16b}, [%3], #16 \n"
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n"
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
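HalfMergeUVRow_NEON combines full-resolution split U and V planes into interleaved UV at half resolution, box-averaging each 2x2 block. A scalar sketch (hypothetical name, even width assumed as in the 16-pixel loop):

static void HalfMergeUVRef(const uint8_t* src_u, int stride_u,
                           const uint8_t* src_v, int stride_v,
                           uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; i += 2) {
    dst_uv[0] = (uint8_t)((src_u[i] + src_u[i + 1] + src_u[stride_u + i] +
                           src_u[stride_u + i + 1] + 2) >> 2);
    dst_uv[1] = (uint8_t)((src_v[i] + src_v[i + 1] + src_v[stride_v + i] +
                           src_v[stride_v + i + 1] + 2) >> 2);
    dst_uv += 2;
  }
}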
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/row_win.cc b/chromium/third_party/libyuv/source/row_win.cc
index 5500d7f5a64..9afcf060a4d 100644
--- a/chromium/third_party/libyuv/source/row_win.cc
+++ b/chromium/third_party/libyuv/source/row_win.cc
@@ -1594,9 +1594,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
+ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
convertloop:
@@ -2898,10 +2898,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3(
}
#endif // HAS_I422TOARGBROW_SSSE3
+// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2949,6 +2951,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -3045,15 +3048,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
@@ -3078,7 +3081,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
ret
}
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
@@ -4222,7 +4225,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
add ecx, 4 - 1
jl convertloop1b
- // 1 pixel loop.
+ // 1 pixel loop.
convertloop1:
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
@@ -5360,7 +5363,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5448,9 +5451,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
@@ -5534,7 +5537,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc
index 2cfa1c6cb1c..cf3c0332573 100644
--- a/chromium/third_party/libyuv/source/scale.cc
+++ b/chromium/third_party/libyuv/source/scale.cc
@@ -17,6 +17,7 @@
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h" // For UVScale
#ifdef __cplusplus
namespace libyuv {
@@ -103,6 +104,21 @@ static void ScalePlaneDown2(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI
+ : ScaleRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_MMI
+ : ScaleRowDown2Box_MMI);
+ }
+ }
+#endif
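These MMI hunks repeat the dispatch idiom used throughout scale.cc: select the _Any_ variant first, which processes the aligned bulk in SIMD and leaves the ragged tail to C, then upgrade to the full SIMD row when the width is suitably aligned. A condensed sketch of the pattern; HAS_SCALEROW_SIMD, kCpuHasSIMD and the ScaleRow_* names here are placeholders, not real symbols:

void (*ScaleRow)(const uint8_t*, ptrdiff_t, uint8_t*, int) = ScaleRow_C;
#if defined(HAS_SCALEROW_SIMD)
if (TestCpuFlag(kCpuHasSIMD)) {
  ScaleRow = ScaleRow_Any_SIMD;  // any width; C covers the remainder
  if (IS_ALIGNED(dst_width, 8)) {
    ScaleRow = ScaleRow_SIMD;    // whole row stays in SIMD
  }
}
#endif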
#if defined(HAS_SCALEROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown2 =
@@ -169,6 +185,14 @@ static void ScalePlaneDown2_16(int src_width,
: ScaleRowDown2Box_16_SSE2);
}
#endif
+#if defined(HAS_SCALEROWDOWN2_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_16_MMI
+ : ScaleRowDown2Box_16_MMI);
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -232,6 +256,15 @@ static void ScalePlaneDown4(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN4_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN4_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown4 =
@@ -284,6 +317,11 @@ static void ScalePlaneDown4_16(int src_width,
filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
}
#endif
+#if defined(HAS_SCALEROWDOWN4_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI;
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -341,6 +379,18 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_MMI;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_MMI;
+ if (dst_width % 24 == 0) {
+ ScaleRowDown34_0 = ScaleRowDown34_MMI;
+ ScaleRowDown34_1 = ScaleRowDown34_MMI;
+ }
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN34_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
if (!filtering) {
@@ -841,6 +891,14 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
+#if defined(HAS_SCALEADDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleAddRow = ScaleAddRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 8)) {
+ ScaleAddRow = ScaleAddRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleAddRow = ScaleAddRow_Any_MSA;
@@ -904,6 +962,11 @@ static void ScalePlaneBox_16(int src_width,
}
#endif
+#if defined(HAS_SCALEADDROW_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
+ ScaleAddRow = ScaleAddRow_16_MMI;
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
@@ -980,6 +1043,14 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -1207,6 +1278,11 @@ void ScalePlaneBilinearUp(int src_width,
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -1334,6 +1410,11 @@ void ScalePlaneBilinearUp_16(int src_width,
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -1419,6 +1500,11 @@ static void ScalePlaneSimple(int src_width,
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_MMI;
+ }
+#endif
}
for (i = 0; i < dst_height; ++i) {
@@ -1455,6 +1541,11 @@ static void ScalePlaneSimple_16(int src_width,
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_16_MMI;
+ }
+#endif
}
for (i = 0; i < dst_height; ++i) {
@@ -1580,7 +1671,7 @@ void ScalePlane_16(const uint16_t* src,
}
if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, 0, 0, dy, 1, filtering);
return;
@@ -1710,6 +1801,109 @@ int I420Scale_16(const uint16_t* src_y,
return 0;
}
+// Scale an I444 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+ dst_width, dst_height, filtering);
+ return 0;
+}
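Since I444 chroma is full resolution, all three planes share the same geometry. A hypothetical call halving a 640x480 I444 image, assuming contiguous planes (stride == width); buffers and error handling elided:

I444Scale(src_y, 640, src_u, 640, src_v, 640, 640, 480,
          dst_y, 320, dst_u, 320, dst_v, 320, 320, 240, kFilterBox);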
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+ dst_width, dst_height, filtering);
+ return 0;
+}
+
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_uv || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+ dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+ return 0;
+}
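SUBSAMPLE(v, 1, 1) here yields the half dimension rounded up ((v + 1) >> 1 for non-negative v), so odd sizes keep a full chroma sample. A hypothetical call shrinking a 1280x720 NV12 frame to 640x360; allocation and the return-code check are elided:

int r = NV12Scale(src_y, 1280, src_uv, 1280, 1280, 720,
                  dst_y, 640, dst_uv, 640, 640, 360, kFilterBilinear);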
+
// Deprecated api
LIBYUV_API
int Scale(const uint8_t* src_y,
diff --git a/chromium/third_party/libyuv/source/scale_any.cc b/chromium/third_party/libyuv/source/scale_any.cc
index 53ad1364049..c93d70c5fc7 100644
--- a/chromium/third_party/libyuv/source/scale_any.cc
+++ b/chromium/third_party/libyuv/source/scale_any.cc
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <string.h> // For memset/memcpy
+
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
@@ -18,46 +20,6 @@ namespace libyuv {
extern "C" {
#endif
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
- int dx) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
- }
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEFILTERCOLS_MSA
-CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MSA
-CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON,
- ScaleARGBFilterCols_NEON,
- ScaleARGBFilterCols_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_MSA
-CANY(ScaleARGBFilterCols_Any_MSA,
- ScaleARGBFilterCols_MSA,
- ScaleARGBFilterCols_C,
- 4,
- 7)
-#endif
-#undef CANY
-
// Fixed scale down.
// Mask may be non-power of 2, so use MOD
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
@@ -108,6 +70,22 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+ ScaleUVRowDown2Box_SSSE3,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 4)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+ ScaleUVRowDown2Box_AVX2,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 8)
+#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
@@ -150,6 +128,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+ ScaleUVRowDown2Box_NEON,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 8)
+#endif
+
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
@@ -165,6 +152,27 @@ SDANY(ScaleRowDown2Box_Any_MSA,
1,
31)
#endif
+#ifdef HAS_SCALEROWDOWN2_MMI
+SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
+SDANY(ScaleRowDown2Linear_Any_MMI,
+ ScaleRowDown2Linear_MMI,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 7)
+SDANY(ScaleRowDown2Box_Any_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 7)
+SDODD(ScaleRowDown2Box_Odd_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 7)
+#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3,
@@ -201,6 +209,15 @@ SDANY(ScaleRowDown4Box_Any_MSA,
1,
15)
#endif
+#ifdef HAS_SCALEROWDOWN4_MMI
+SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_MMI,
+ ScaleRowDown4Box_MMI,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3,
ScaleRowDown34_SSSE3,
@@ -261,6 +278,14 @@ SDANY(ScaleRowDown34_1_Box_Any_MSA,
1,
47)
#endif
+#ifdef HAS_SCALEROWDOWN34_MMI
+SDANY(ScaleRowDown34_Any_MMI,
+ ScaleRowDown34_MMI,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3,
ScaleRowDown38_SSSE3,
@@ -382,6 +407,26 @@ SDANY(ScaleARGBRowDown2Box_Any_MSA,
4,
3)
#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MMI
+SDANY(ScaleARGBRowDown2_Any_MMI,
+ ScaleARGBRowDown2_MMI,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Linear_Any_MMI,
+ ScaleARGBRowDown2Linear_MMI,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Box_Any_MMI,
+ ScaleARGBRowDown2Box_MMI,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 1)
+#endif
#undef SDANY
// Scale down by even scale factor.
@@ -433,6 +478,64 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
4,
3)
#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
+SDAANY(ScaleARGBRowDownEven_Any_MMI,
+ ScaleARGBRowDownEven_MMI,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 1)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
+ ScaleARGBRowDownEvenBox_MMI,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 1)
+#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
+#endif
+
+#ifdef SASIMDONLY
+// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
+
+// Add rows box filter scale down. Using macro from row_any
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint16_t dst_temp[32]); \
+ SIMD_ALIGNED(uint8_t src_temp[32]); \
+ memset(dst_temp, 0, 32 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(src_temp, dst_temp, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
+#endif
+#undef SAROW
+
+#else
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
@@ -456,8 +559,56 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#ifdef HAS_SCALEADDROW_MSA
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
+#endif
#undef SAANY
+#endif // SASIMDONLY
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MMI
+CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#undef CANY
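As a reading aid for the CANY adapter above: MASK is the SIMD granularity minus one, and the C tail resumes at the advanced fixed-point position, so the sampling phase stays continuous across the seam. For example, with MASK = 7 and dst_width = 30:

// n = 30 & ~7 = 24 columns go through TERP_SIMD;
// r = 30 & 7  = 6 columns go through TERP_C, writing at
// dst_ptr + 24 * BPP with x advanced to x + 24 * dx.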
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libyuv/source/scale_argb.cc
index 53a22e8b41e..451d4ec4d1b 100644
--- a/chromium/third_party/libyuv/source/scale_argb.cc
+++ b/chromium/third_party/libyuv/source/scale_argb.cc
@@ -95,6 +95,22 @@ static void ScaleARGBDown2(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
+ : ScaleARGBRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
+ : ScaleARGBRowDown2Box_MMI);
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDown2 =
@@ -227,6 +243,16 @@ static void ScaleARGBDownEven(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
+ : ScaleARGBRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
@@ -410,6 +436,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -456,6 +490,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
@@ -471,6 +513,11 @@ static void ScaleARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -572,6 +619,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -658,6 +713,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
@@ -673,6 +736,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
const int max_y = (src_height - 1) << 16;
@@ -789,6 +857,14 @@ static void ScaleARGBSimple(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBCols = ScaleARGBCols_Any_MSA;
@@ -804,6 +880,11 @@ static void ScaleARGBSimple(int src_width,
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
for (j = 0; j < dst_height; ++j) {
@@ -900,7 +981,7 @@ static void ScaleARGB(const uint8_t* src,
}
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc
index b28d7da41fc..81959925c8a 100644
--- a/chromium/third_party/libyuv/source/scale_common.cc
+++ b/chromium/third_party/libyuv/source/scale_common.cc
@@ -542,7 +542,9 @@ void ScaleFilterCols64_C(uint8_t* dst_ptr,
// Same as 8 bit arm blender but return is cast to uint16_t
#define BLENDER(a, b, f) \
- (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+ (uint16_t)( \
+ (int)(a) + \
+ (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
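The widening to int64_t matters because both f and the 16-bit sample delta can occupy 16 bits: with f = 0xffff and (b) - (a) = 65535 the product is 4294836225, which overflows 32-bit int (undefined behavior); the 64-bit product is exact before the >> 16. A standalone sketch of the fixed arithmetic, with a hypothetical helper name:

#include <stdint.h>

static uint16_t Blend16(uint16_t a, uint16_t b, uint32_t f) {
  // Same arithmetic as the corrected BLENDER: widen before multiplying.
  return (uint16_t)(
      (int)a + (int)((((int64_t)f * ((int64_t)b - (int)a)) + 0x8000) >> 16));
}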
void ScaleFilterCols_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
@@ -774,6 +776,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr,
}
}
+// ARGB scale row functions
+
void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -1016,6 +1020,235 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
#undef BLENDERC
#undef BLENDER
+// UV scale row functions.
+// Same as ARGB, but with 2 channels.
+
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ (void)src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += src_stepx * 2;
+ dst_uv += 2;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+ (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
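For readers tracing the fixed-point math: x is 16.16 fixed point and xf keeps only the top 7 fractional bits, so the blend runs at 1/128 granularity. A worked example, purely illustrative:

// x = 0x18000 is source position 1.5:
//   xi = x >> 16         = 1
//   xf = (x >> 9) & 0x7f = 0x40 (64)
// BLENDER1 then evaluates (a * (0x7f ^ 64) + b * 64) >> 7, i.e.
// (a * 63 + b * 64) >> 7 -- roughly the midpoint, with the 127-vs-128
// weighting that the TODO above proposes to fix.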
+
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -1065,6 +1298,14 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width_bytes, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -1171,8 +1412,8 @@ enum FilterMode ScaleFilterReduce(int src_width,
src_height = -src_height;
}
if (filtering == kFilterBox) {
- // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
- if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ // If scaling either axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
}
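The operator change means box filtering now survives reduction only when both axes shrink below half size; scaling either axis by a factor of 0.5 or more falls back to bilinear. An illustrative call against the function's own signature:

// 1000x1000 -> 600x400: the width only shrinks to 0.6x.
enum FilterMode f = ScaleFilterReduce(1000, 1000, 600, 400, kFilterBox);
// Old '&&' test: stays kFilterBox (400 * 2 < 1000).
// New '||' test: becomes kFilterBilinear (600 * 2 >= 1000).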
diff --git a/chromium/third_party/libyuv/source/scale_gcc.cc b/chromium/third_party/libyuv/source/scale_gcc.cc
index 312236d2df8..e575ee18bcb 100644
--- a/chromium/third_party/libyuv/source/scale_gcc.cc
+++ b/chromium/third_party/libyuv/source/scale_gcc.cc
@@ -102,16 +102,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -125,25 +125,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -156,33 +156,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -200,17 +200,17 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -225,26 +225,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -258,34 +258,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -301,24 +301,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -332,46 +332,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea 0x00(%4,%4,2),%3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%4,2),%%xmm2 \n"
- "movdqu 0x10(%0,%4,2),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -387,26 +387,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -420,46 +420,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -476,32 +476,32 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -514,58 +514,58 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
- );
+ );
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -580,62 +580,62 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
- );
+ );
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -651,23 +651,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -681,39 +681,39 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -726,57 +726,57 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
- );
+ );
asm volatile(
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqu 0x00(%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -791,25 +791,25 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n" // src_ptr += 16
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -824,22 +824,22 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm3 \n"
- "lea 0x20(%0),%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw (%1),%%ymm2,%%ymm0 \n"
- "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -866,69 +866,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile(
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
-
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movzwl 0x00(%1,%4,1),%k2 \n"
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
// 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2,(%0) \n"
- "lea 0x2(%0),%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2,(%0) \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -966,16 +966,16 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -993,14 +993,14 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1017,17 +1017,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1043,21 +1043,21 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1076,23 +1076,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
"1: \n"
- "movd (%0),%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%0,%1,2),%%xmm2 \n"
- "movd 0x00(%0,%4,1),%%xmm3 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1113,32 +1113,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
- "lea 0x00(%0,%5,1),%5 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps 0x00(%0,%1,1),%%xmm0 \n"
- "movq 0x00(%0,%1,2),%%xmm1 \n"
- "movhps 0x00(%0,%4,1),%%xmm1 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps 0x00(%5,%1,1),%%xmm2 \n"
- "movq 0x00(%5,%1,2),%%xmm3 \n"
- "movhps 0x00(%5,%4,1),%%xmm3 \n"
- "lea 0x00(%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1156,56 +1156,56 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
LABELALIGN
"40: \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%3,%0,4),%%xmm1 \n"
- "movd 0x00(%3,%1,4),%%xmm4 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
"29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
"99: \n"
: "=&a"(x0), // %0
"=&d"(x1), // %1
@@ -1230,16 +1230,16 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1267,63 +1267,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
- );
+ );
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps 0x00(%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
-
- LABELALIGN "99: \n" // clang-format error.
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+
+ LABELALIGN
+ "99: \n" // clang-format error.
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1339,10 +1340,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
@@ -1353,19 +1354,108 @@ int FixedDiv_X86(int num, int div) {
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
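+
+// The two routines above compute a 16.16 fixed-point quotient entirely in
+// x86 registers. An illustrative portable model of the same arithmetic
+// follows (the _Sketch names are hypothetical, not libyuv APIs; assumes
+// div != 0 and, for the FixedDiv1 variant, div > 1):
+static int FixedDiv_Sketch(int num, int div) {
+  // Widen to 64 bits, shift into 16.16 fixed point, then divide.
+  return (int)((((int64_t)num) << 16) / div);
+}
+
+static int FixedDiv1_Sketch(int num, int div) {
+  // Mirrors the sub $0x10001 / sbb $0x0 / sub $0x1 sequence above.
+  return (int)(((((int64_t)num) << 16) - 0x00010001) / (div - 1));
+}
+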
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+// Shuffle table for splitting UV into upper and lower part of register.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+ 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
+ 6u, 14u, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80};
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n" // vertical add
+ "psrlw $0x1,%%xmm0 \n" // round
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
+ "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
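+
+// Both UV kernels above implement a 2x2 box filter over interleaved UV
+// pixels: each output U (and V) is the rounded average of a 2x2 block.
+// Illustrative scalar model (the _Sketch name is hypothetical, not a libyuv
+// API; the psrlw $1 + pavgw pair rounds identically to (sum + 2) >> 2):
+static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* s = src_ptr;               // row 0
+  const uint8_t* t = src_ptr + src_stride;  // row 1
+  int x;
+  for (x = 0; x < dst_width; ++x) {  // dst_width counts output UV pairs
+    dst_ptr[0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
+    dst_ptr[1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
+    s += 4;
+    t += 4;
+    dst_ptr += 2;
+  }
+}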
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/scale_mmi.cc b/chromium/third_party/libyuv/source/scale_mmi.cc
new file mode 100644
index 00000000000..1226ef3eaf5
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_mmi.cc
@@ -0,0 +1,1168 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Mips MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// clang-format off
+
+// CPU agnostic row functions
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+ const uint64_t shift = 0x8ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlh %[src0], %[src0], %[shift] \n\t"
+
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlh %[src1], %[src1], %[shift] \n\t"
+
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t mask = 0x00ff00ff00ff00ffULL;
+ const uint64_t shift = 0x8ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "and %[dest0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "and %[dest1], %[src1], %[mask] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+
+ "psrlh %[src0], %[src0], %[shift] \n\t"
+ "psrlh %[src1], %[src1], %[shift] \n\t"
+ "packushb %[dest1], %[src0], %[src1] \n\t"
+
+ "pavgb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
+ [shift] "f"(shift), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+
+ uint64_t s0, s1, t0, t1;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t ph = 0x0002000200020002ULL;
+ const uint64_t mask = 0x00ff00ff00ff00ffULL;
+ const uint64_t shift0 = 0x2ULL;
+ const uint64_t shift1 = 0x8ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "psrlh %[s1], %[s0], %[shift1] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "psrlh %[t1], %[t0], %[shift1] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddh %[dest0], %[s0], %[s1] \n\t"
+ "paddh %[dest0], %[dest0], %[t0] \n\t"
+ "paddh %[dest0], %[dest0], %[t1] \n\t"
+ "paddh %[dest0], %[dest0], %[ph] \n\t"
+ "psrlh %[dest0], %[dest0], %[shift0] \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "psrlh %[s1], %[s0], %[shift1] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "psrlh %[t1], %[t0], %[shift1] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddh %[dest1], %[s0], %[s1] \n\t"
+ "paddh %[dest1], %[dest1], %[t0] \n\t"
+ "paddh %[dest1], %[dest1], %[t1] \n\t"
+ "paddh %[dest1], %[dest1], %[ph] \n\t"
+ "psrlh %[dest1], %[dest1], %[shift0] \n\t"
+
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
+ [mask] "f"(mask)
+ : "memory");
+}
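+
+// Illustrative scalar model of the planar 2x2 box filter above (the _Sketch
+// name is hypothetical, not a libyuv API): each output byte is the rounded
+// average of two horizontal neighbours from two adjacent rows, matching the
+// +2 bias and >> 2 in the MMI code.
+static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst,
+                                    int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
+    s += 2;
+    t += 2;
+  }
+}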
+
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpckhwd %[dest], %[src0], %[src1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
+ "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
+
+ "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ const uint8_t* s = src_argb;
+ const uint8_t* t = src_argb + src_stride;
+
+ uint64_t s0, s_hi, s_lo;
+ uint64_t t0, t_hi, t_lo;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t mask = 0x0ULL;
+ const uint64_t ph = 0x0002000200020002ULL;
+  const uint64_t shift = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
+ "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
+ "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
+ "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
+
+ "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
+      "psrlh      %[dest_lo], %[dest_lo], %[shift]       \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
+ "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
+ "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
+ "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
+
+ "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
+      "psrlh      %[dest_hi], %[dest_hi], %[shift]       \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
+ [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
+        [mask] "f"(mask), [ph] "f"(ph), [shift] "f"(shift)
+ : "memory");
+}
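+
+// The ARGB box filter above applies the same (sum + 2) >> 2 average
+// independently to each of the four bytes of two horizontally adjacent
+// pixels from two rows. Illustrative scalar model (hypothetical _Sketch
+// helper, not a libyuv API):
+static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
+                                        ptrdiff_t src_stride,
+                                        uint8_t* dst_argb,
+                                        int dst_width) {
+  int x, c;
+  for (x = 0; x < dst_width; ++x) {
+    for (c = 0; c < 4; ++c) {  // B, G, R, A channels
+      dst_argb[c] = (uint8_t)((src_argb[c] + src_argb[c + 4] +
+                               src_argb[c + src_stride] +
+                               src_argb[c + src_stride + 4] + 2) >>
+                              2);
+    }
+    src_argb += 8;  // two source pixels consumed
+    dst_argb += 4;  // one destination pixel produced
+  }
+}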
+
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+ const uint64_t shift = 0x10ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+
+ "packsswh %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
+
+ "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
+
+ "pavgh %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+
+ uint64_t s0, s1, s_hi, s_lo;
+ uint64_t t0, t1, t_hi, t_lo;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t ph = 0x0000000200000002ULL;
+ const uint64_t mask = 0x0000ffff0000ffffULL;
+ const uint64_t shift0 = 0x10ULL;
+ const uint64_t shift1 = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "psrlw %[s1], %[s0], %[shift0] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "psrlw %[t1], %[t0], %[shift0] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddw %[dest0], %[s0], %[s1] \n\t"
+ "paddw %[dest0], %[dest0], %[t0] \n\t"
+ "paddw %[dest0], %[dest0], %[t1] \n\t"
+ "paddw %[dest0], %[dest0], %[ph] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift1] \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "psrlw %[s1], %[s0], %[shift0] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "psrlw %[t1], %[t0], %[shift0] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddw %[dest1], %[s0], %[s1] \n\t"
+ "paddw %[dest1], %[dest1], %[t0] \n\t"
+ "paddw %[dest1], %[dest1], %[t1] \n\t"
+ "paddw %[dest1], %[dest1], %[ph] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift1] \n\t"
+
+ "packsswh %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
+ [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
+ [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
+ [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t shift = 0x10ULL;
+ const uint64_t mask = 0x000000ff000000ffULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+ "and %[src0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+ "and %[src1], %[src1], %[mask] \n\t"
+ "packsswh %[dest_lo], %[src0], %[src1] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+ "and %[src0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+ "and %[src1], %[src1], %[mask] \n\t"
+ "packsswh %[dest_hi], %[src0], %[src1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift), [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
+ "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
+ "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
+ "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+
+#define DO_SCALEROWDOWN4BOX_LOOP(reg) \
+ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
+ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
+ \
+ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
+ "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
+ "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
+ "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
+ "paddh " #reg ", " #reg ", %[ph] \n\t" \
+ "psrlh " #reg ", " #reg ", %[shift] \n\t" \
+ \
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
+ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
+
+/* LibYUVScaleTest.ScaleDownBy4_Box */
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* src0_ptr = src_ptr;
+ const uint8_t* src1_ptr = src_ptr + src_stride;
+ const uint8_t* src2_ptr = src_ptr + src_stride * 2;
+ const uint8_t* src3_ptr = src_ptr + src_stride * 3;
+
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
+
+ const uint64_t mask0 = 0x0ULL;
+ const uint64_t mask1 = 0x0001000100010001ULL;
+ const uint64_t ph = 0x0008000800080008ULL;
+ const uint64_t shift = 0x4ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
+
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
+ [ph] "f"(ph), [mask1] "f"(mask1)
+ : "memory");
+}
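+
+// The macro pipeline above sums a 4x4 block per output byte: vertical paddh
+// over four rows, then two pmaddhw-by-one passes fold the horizontal
+// columns, then (sum + 8) >> 4. Illustrative scalar model (hypothetical
+// _Sketch helper, not a libyuv API):
+static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst,
+                                    int dst_width) {
+  int x, i, j;
+  for (x = 0; x < dst_width; ++x) {
+    int sum = 0;
+    for (i = 0; i < 4; ++i) {    // four rows
+      for (j = 0; j < 4; ++j) {  // four columns
+        sum += src_ptr[i * src_stride + j];
+      }
+    }
+    dst[x] = (uint8_t)((sum + 8) >> 4);
+    src_ptr += 4;
+  }
+}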
+
+#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+
+#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
+ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
+ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
+ \
+ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
+ "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
+ "paddw %[dest], %[dest_hi], %[dest] \n\t" \
+ "paddw %[dest], %[dest], %[ph] \n\t" \
+ "psraw %[dest], %[dest], %[shift] \n\t" \
+ "and " #reg ", %[dest], %[mask1] \n\t" \
+ \
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
+ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
+
+/* LibYUVScaleTest.ScaleDownBy4_Box_16 */
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* src0_ptr = src_ptr;
+ const uint16_t* src1_ptr = src_ptr + src_stride;
+ const uint16_t* src2_ptr = src_ptr + src_stride * 2;
+ const uint16_t* src3_ptr = src_ptr + src_stride * 3;
+
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
+
+ const uint64_t mask0 = 0x0ULL;
+ const uint64_t mask1 = 0x00000000ffffffffULL;
+ const uint64_t ph = 0x0000000800000008ULL;
+ const uint64_t shift = 0x04ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
+ "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
+ "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
+ [ph] "f"(ph), [mask1] "f"(mask1)
+ : "memory");
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest;
+
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest;
+
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+
+ "punpcklhw %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "punpckhhw %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
+ : "memory");
+}
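+
+// Both ColsUp2 kernels above duplicate every source sample once; punpcklbh /
+// punpcklhw of a register with itself interleaves each element with a copy.
+// Illustrative scalar model for the 8-bit case (hypothetical _Sketch helper,
+// not a libyuv API; x and dx are unused, as above):
+static void ScaleColsUp2_Sketch(uint8_t* dst_ptr,
+                                const uint8_t* src_ptr,
+                                int dst_width,
+                                int x,
+                                int dx) {
+  int j;
+  (void)x;
+  (void)dx;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[j] = dst_ptr[j + 1] = src_ptr[j / 2];  // point sample, repeated 2x
+  }
+}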
+
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ uint64_t src, src_hi, src_lo, dest0, dest1;
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask] \n\t"
+
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "paddush %[dest0], %[dest0], %[src_lo] \n\t"
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "paddush %[dest1], %[dest1], %[src_hi] \n\t"
+
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [src] "=&f"(src)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
+ [mask] "f"(mask)
+ : "memory");
+}
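+
+// ScaleAddRow accumulates a row of 8-bit samples into a 16-bit sum buffer;
+// the paddush instructions above saturate rather than wrap. Illustrative
+// scalar model (hypothetical _Sketch helper, not a libyuv API; uses plain
+// adds, assuming the accumulator never exceeds 16 bits):
+static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               int src_width) {
+  int x;
+  for (x = 0; x < src_width; ++x) {
+    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
+  }
+}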
+
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width) {
+ uint64_t src, src_hi, src_lo, dest0, dest1;
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "punpcklhw %[src_lo], %[src], %[mask] \n\t"
+ "punpckhhw %[src_hi], %[src], %[mask] \n\t"
+
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "paddw %[dest0], %[dest0], %[src_lo] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "paddw %[dest1], %[dest1], %[src_hi] \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [src] "=&f"(src)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
+ "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
+ [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ const uint8_t* src0_ptr = src_argb;
+ const uint8_t* src1_ptr = src_argb + src_stride;
+
+ uint64_t src0, src1, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1;
+
+ const uint64_t mask = 0x0ULL;
+ const uint64_t ph = 0x0002000200020002ULL;
+ const uint64_t shift = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
+ "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
+
+ "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
+ "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+ "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
+ "paddh %[dest0], %[dest0], %[ph] \n\t"
+ "psrlh %[dest0], %[dest0], %[shift] \n\t"
+
+ "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
+ "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
+
+ "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
+ "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
+
+ "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
+ "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+ "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
+ "paddh %[dest1], %[dest1], %[ph] \n\t"
+ "psrlh %[dest1], %[dest1], %[shift] \n\t"
+
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
+ "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
+ [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
+ [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
+ [ph] "f"(ph)
+ : "memory");
+}
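
With ph = 0x0002000200020002 and shift = 2, each channel of an output pixel is the rounded average of a 2x2 block of source pixels: (a + b + c + d + 2) >> 2. A per-channel scalar sketch of that rounding (helper name illustrative):

#include <stdint.h>

// Rounded 2x2 box average for one 8-bit channel, matching the +2 bias (ph)
// and the >>2 (shift) used in the MMI loop above.
static inline uint8_t Box2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}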
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+
+ const uint32_t* src_tmp;
+
+ uint64_t dest, offset;
+
+ const uint64_t shift0 = 16;
+ const uint64_t shift1 = 2;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "srav %[offset], %[x], %[shift0] \n\t"
+ "sllv %[offset], %[offset], %[shift1] \n\t"
+ "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
+ "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
+ "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[x], %[x], %[dx] \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
+ "daddi %[width], %[width], -0x01 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
+ : "memory");
+}
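
x and dx are 16.16 fixed-point, so the srav by 16 extracts the integer pixel index and the sllv by 2 turns it into a byte offset into 4-byte ARGB pixels. The scalar loop this vectorizes is essentially libyuv's portable C fallback:

#include <stdint.h>

// Point-sample dst_width ARGB pixels at 16.16 fixed-point positions.
static void ScaleARGBCols_Ref(uint32_t* dst, const uint32_t* src,
                              int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of x picks the source pixel
    x += dx;
  }
}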
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest0, dest1;
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "punpcklwd %[dest0], %[src], %[src] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest1], %[src], %[src] \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
+ : "memory");
+}
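
Unpacking the source against itself (punpcklwd/punpckhwd with both operands equal) duplicates each 32-bit pixel, i.e. dst[2*i] = dst[2*i+1] = src[i]. A scalar sketch:

#include <stdint.h>

// 2x horizontal upscale by pixel duplication (point sampling).
static void ScaleARGBColsUp2_Ref(uint32_t* dst, const uint32_t* src,
                                 int dst_width) {
  for (int i = 0; i < dst_width / 2; ++i) {
    dst[2 * i] = dst[2 * i + 1] = src[i];
  }
}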
+
+// Divide num by div and return as 16.16 fixed point result.
+/* LibYUVBaseTest.TestFixedDiv */
+int FixedDiv_MIPS(int num, int div) {
+ int quotient = 0;
+ const int shift = 16;
+
+ asm(
+ "dsll %[num], %[num], %[shift] \n\t"
+ "ddiv %[num], %[div] \t\n"
+ "mflo %[quo] \t\n"
+ : [quo] "+&r"(quotient)
+ : [num] "r"(num), [div] "r"(div), [shift] "r"(shift));
+
+ return quotient;
+}
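
The dsll/ddiv pair is the 16.16 fixed-point divide; in C the same computation (which should match libyuv's FixedDiv_C) is:

#include <stdint.h>

// Shift the numerator into 16.16 fixed point, then divide.
static int FixedDiv_Ref(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}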
+
+// Divide num by div and return as 16.16 fixed point result.
+/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
+int FixedDiv1_MIPS(int num, int div) {
+ int quotient = 0;
+ const int shift = 16;
+ const int val1 = 1;
+ const int64_t val11 = 0x00010001ULL;
+
+ asm(
+ "dsll %[num], %[num], %[shift] \n\t"
+ "dsub %[num], %[num], %[val11] \n\t"
+ "dsub %[div], %[div], %[val1] \n\t"
+ "ddiv %[num], %[div] \t\n"
+ "mflo %[quo] \t\n"
+ : [quo] "+&r"(quotient)
+ : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11),
+ [shift] "r"(shift));
+
+ return quotient;
+}
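
The two dsub instructions bias the operands before the divide (numerator minus 0x00010001, divisor minus 1), the form used when computing column interpolation steps. In C this should be equivalent to:

#include <stdint.h>

// Biased 16.16 fixed-point divide (numerator - 0x00010001, divisor - 1).
static int FixedDiv1_Ref(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}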
+
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads an extra pixel per row, so 9x2.
+void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* src2_ptr = src_ptr + src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest04, dest15, dest26, dest37;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ const uint64_t mask0 = 0x0003000900030009ULL;
+ const uint64_t mask1 = 0x0001000300010003ULL;
+ const uint64_t mask2 = 0x0009000300090003ULL;
+ const uint64_t mask3 = 0x0003000100030001ULL;
+ const uint64_t ph = 0x0000000800000008ULL;
+ const uint64_t shift = 4;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
+ "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
+ "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
+ "pmaddhw %[dest], %[src1], %[mask1] \n\t"
+ "paddw %[dest04], %[dest04], %[dest] \n\t"
+ "paddw %[dest04], %[dest04], %[ph] \n\t"
+ "psrlw %[dest04], %[dest04], %[shift] \n\t"
+
+ "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
+ "pmaddhw %[dest], %[src1], %[mask3] \n\t"
+ "paddw %[dest15], %[dest15], %[dest] \n\t"
+ "paddw %[dest15], %[dest15], %[ph] \n\t"
+ "psrlw %[dest15], %[dest15], %[shift] \n\t"
+
+ "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
+ "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
+ "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
+ "pmaddhw %[dest], %[src1], %[mask1] \n\t"
+ "paddw %[dest26], %[dest26], %[dest] \n\t"
+ "paddw %[dest26], %[dest26], %[ph] \n\t"
+ "psrlw %[dest26], %[dest26], %[shift] \n\t"
+
+ "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
+ "pmaddhw %[dest], %[src1], %[mask3] \n\t"
+ "paddw %[dest37], %[dest37], %[dest] \n\t"
+ "paddw %[dest37], %[dest37], %[ph] \n\t"
+ "psrlw %[dest37], %[dest37], %[shift] \n\t"
+
+ /* tmp0 = ( 00 04 02 06 ) */
+ "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
+ /* tmp1 = ( 01 05 03 07 ) */
+ "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
+
+ /* tmp2 = ( 00 01 04 05 )*/
+ "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
+ /* tmp3 = ( 02 03 06 07 )*/
+ "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
+
+ /* ( 00 01 02 03 ) */
+ "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ /* ( 04 05 06 07 ) */
+ "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
+ [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
+ [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
+ : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
+ : "memory");
+}
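
Reading the pmaddhw masks as weight pairs, mask0/mask1 apply (9,3) to row 0 and (3,1) to row 1, while mask2/mask3 give the mirrored (3,9)/(1,3) taps, so each output is a 2x2 bilinear blend with weights summing to 16, the +8 bias from ph, and the >>4 from shift. One even-phase output in scalar form (inferred from the constants, not copied from libyuv):

#include <stdint.h>

// Even-phase 2x upsample tap: 9:3:3:1 blend of the 2x2 neighborhood,
// rounded (+8) and normalized (>>4). The odd phase swaps to 3:9:1:3.
static inline uint16_t Up2Even(const uint16_t* s, const uint16_t* t, int i) {
  return (uint16_t)((9 * s[i] + 3 * s[i + 1] + 3 * t[i] + t[i + 1] + 8) >> 4);
}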
+
+void ScaleRowDown34_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ uint64_t src[2];
+ uint64_t tmp[2];
+ __asm__ volatile (
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "and %[tmp1], %[src0], %[mask1] \n\t"
+ "psrlw %[tmp0], %[src0], %[rmov] \n\t"
+ "psllw %[tmp0], %[tmp0], %[lmov1] \n\t"
+ "or %[src0], %[tmp0], %[tmp1] \n\t"
+ "punpckhwd %[tmp0], %[src0], %[src0] \n\t"
+ "psllw %[tmp1], %[tmp0], %[rmov] \n\t"
+ "or %[src0], %[src0], %[tmp1] \n\t"
+ "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t"
+ "pextrh %[tmp0], %[tmp0], %[zero] \n\t"
+ "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t"
+ "pextrh %[tmp0], %[src1], %[zero] \n\t"
+ "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t"
+
+ "punpckhwd %[tmp0], %[src1], %[src1] \n\t"
+ "pextrh %[tmp1], %[tmp0], %[zero] \n\t"
+ "psrlw %[src1], %[src1], %[rmov] \n\t"
+ "psllw %[tmp1], %[tmp1], %[rmov8] \n\t"
+ "or %[src1], %[src1], %[tmp1] \n\t"
+ "and %[tmp0], %[tmp0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[tmp0] \n\t"
+
+ "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]),
+ [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1])
+ : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst),
+ [lmov]"f"(0xc), [rmov]"f"(0x18),
+ [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8),
+ [zero]"f"(0x0), [mask2]"f"(0xff000000),
+ [width]"r"(dst_width), [lmov1]"f"(0x10)
+ : "memory"
+ );
+}
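
The shuffle sequence above selects the same pixels as the portable path: of every four source pixels, keep 0, 1 and 3. A scalar sketch mirroring libyuv's ScaleRowDown34_C:

#include <stdint.h>

// 3/4 horizontal downsample: keep pixels 0, 1 and 3 of each group of 4.
static void ScaleRowDown34_Ref(const uint8_t* src_ptr, uint8_t* dst,
                               int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}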
+// clang-format on
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc
index 459a2995dfe..572b4bfa9b3 100644
--- a/chromium/third_party/libyuv/source/scale_neon.cc
+++ b/chromium/third_party/libyuv/source/scale_neon.cc
@@ -31,16 +31,16 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
- );
+ );
}
// Read 32x1 average down and write 16x1.
@@ -51,17 +51,17 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
- );
+ );
}
// Read 32x2 average down and write 16x1.
@@ -71,28 +71,28 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0 \n"
+ "add %1, %0 \n"
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
// row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
@@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [%3]! \n"
- "vld1.8 {q2}, [%4]! \n"
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vld1.8 {q3}, [%3] \n"
+ "vld1.8 {q3}, [%3] \n"
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
asm volatile(
- "vld1.16 {q13}, [%5] \n"
- "vld1.8 {q14}, [%6] \n"
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ "vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -504,38 +504,26 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}
-void ScaleAddRows_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst_ptr,
- int src_width,
- int src_height) {
- const uint8_t* src_tmp;
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
asm volatile(
"1: \n"
- "mov %0, %1 \n"
- "mov r12, %5 \n"
- "veor q2, q2, q2 \n"
- "veor q3, q3, q3 \n"
- "2: \n"
- // load 16 pixels into q0
- "vld1.8 {q0}, [%0], %3 \n"
- "vaddw.u8 q3, q3, d1 \n"
- "vaddw.u8 q2, q2, d0 \n"
- "subs r12, r12, #1 \n"
- "bgt 2b \n"
- "vst1.16 {q2, q3}, [%2]! \n" // store pixels
- "add %1, %1, #16 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop
- "bgt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
:
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
- );
+ : "memory", "cc", "q0", "q1", "q2" // Clobber List
+ );
}
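
The interface change drops the two-dimensional ScaleAddRows (which looped over src_height internally, clobbering r12) in favor of a single-row ScaleAddRow; the caller now owns the row loop. A hedged sketch of how a box-filter caller would drive it (setup and names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Accumulate boxheight source rows into a uint16_t sum row using the
// one-row kernel, as the box-filter caller would after this change.
static void AddBoxRows(const uint8_t* src, ptrdiff_t src_stride,
                       int boxheight, uint16_t* row16, int src_width) {
  memset(row16, 0, src_width * sizeof(uint16_t));
  for (int y = 0; y < boxheight; ++y) {
    ScaleAddRow_NEON(src + y * src_stride, row16, src_width);
  }
}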
// TODO(Yang Zhang): Investigate fewer load instructions for
@@ -559,17 +547,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_ptr;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q3, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "vadd.s32 q2, q1, q3 \n"
- "vshl.i32 q0, q3, #1 \n" // 8 * dx
- "1: \n"
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -578,27 +566,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "vmov q10, q1 \n"
- "vmov q11, q2 \n"
- "vuzp.16 q10, q11 \n"
- "vmovl.u8 q8, d6 \n"
- "vmovl.u8 q9, d7 \n"
- "vsubl.s16 q11, d18, d16 \n"
- "vsubl.s16 q12, d19, d17 \n"
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q10, d21 \n"
- "vmul.s32 q11, q11, q13 \n"
- "vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
- "vadd.s16 q8, q8, q9 \n"
- "vmovn.s16 d6, q8 \n"
-
- "vst1.8 {d6}, [%0]! \n" // store pixels
- "vadd.s32 q1, q1, q0 \n"
- "vadd.s32 q2, q2, q0 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -621,75 +609,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "vld1.8 {q1}, [%1]! \n"
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
- "vst1.8 {d1[7]}, [%0] \n"
+ "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -706,18 +694,18 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vmov q2, q1 \n" // load next 8 ARGB
- "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // load next 8 ARGB
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
@@ -734,19 +722,19 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vrhadd.u8 q1, q2, q3 \n" // rounding half add
- "vst2.32 {q0, q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
+ );
}
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
@@ -755,27 +743,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -793,15 +781,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
- "mov r12, %3, lsl #2 \n"
+ "mov r12, %3, lsl #2 \n"
"1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -817,30 +805,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -877,8 +865,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -909,16 +897,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_argb;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q9, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- "vmov.i8 q3, #0x7f \n" // 0x7F
- "vmov.i16 q15, #0x7f \n" // 0x7F
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q8, q1, q0 \n"
- "1: \n"
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
@@ -962,6 +950,64 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef LOAD2_DATA32_LANE
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
+ "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
+ "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst2.8 {d0, d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q8", "q9");
+}
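
Per output pixel this is the same rounded 2x2 box as the ARGB variant, applied to interleaved 2-byte UV samples. A scalar sketch of one output row (helper name illustrative):

#include <stdint.h>

// Rounded 2x2 box average of interleaved UV; s and t are adjacent rows.
static void ScaleUVRowDown2Box_Ref(const uint8_t* s, const uint8_t* t,
                                   uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[2 * x + 0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
    dst[2 * x + 1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
    s += 4;
    t += 4;
  }
}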
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n"
+ "vld1.16 {d0[1]}, [%1], %6 \n"
+ "vld1.16 {d0[2]}, [%2], %6 \n"
+ "vld1.16 {d0[3]}, [%3], %6 \n"
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6
+ : "memory", "cc", "d0");
+}
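
The four staggered pointers just unroll a simple 2-byte gather: output pixel i is input pixel i * src_stepx. A scalar sketch (illustrative name):

#include <stdint.h>
#include <string.h>

// Keep every src_stepx-th UV pixel (2 bytes each), no filtering.
static void ScaleUVRowDownEven_Ref(const uint8_t* src, int src_stepx,
                                   uint8_t* dst, int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    memcpy(dst + 2 * i, src + 2 * i * src_stepx, 2);
  }
}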
+
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libyuv/source/scale_neon64.cc
index 494a9cfbfbe..185591cb55b 100644
--- a/chromium/third_party/libyuv/source/scale_neon64.cc
+++ b/chromium/third_party/libyuv/source/scale_neon64.cc
@@ -29,16 +29,17 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
- );
+ );
}
// Read 32x1 average down and write 16x1.
@@ -50,17 +51,18 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
- );
+ );
}
// Read 32x2 average down and write 16x1.
@@ -70,26 +72,28 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
@@ -99,10 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -119,19 +124,23 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- "ld1 {v1.16b}, [%2], #16 \n"
- "ld1 {v2.16b}, [%3], #16 \n"
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_ptr1), // %2
@@ -151,12 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -169,49 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
// 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -226,33 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -279,14 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- "st1 {v2.8b}, [%1], #8 \n"
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -303,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v29.8h}, [%5] \n"
- "ld1 {v30.16b}, [%6] \n"
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
// combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -372,35 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
+ "add v0.8h, v0.8h, v16.8h \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
// Align for table lookup, vtbl requires registers to be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -422,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v30.8h}, [%4] \n"
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
      // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
// combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
+ "uqrshrn v2.8b, v2.8h, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -478,33 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -515,38 +535,27 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}
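
The "multiply by 65536 / n and take the upper 16 bits" comments above compress the whole trick into one sqrdmulh per vector. An editorial scalar sketch (not part of the patch) of the same fixed-point division; since sqrdmulh is a doubling high-half multiply, the NEON constant is presumably stored pre-halved (e.g. 65536 / 12 for n = 6), whereas this form uses the plain reciprocal plus a rounding bias:

    #include <cstdint>

    // Divide a box-filter sum by a non-power-of-2 n via a 16.16 reciprocal.
    static inline uint16_t DivideByN(uint32_t sum, uint32_t n) {
      const uint32_t recip = 65536u / n;                // 16.16 reciprocal
      return (uint16_t)((sum * recip + 32768u) >> 16);  // keep upper 16 bits
    }
    // DivideByN(6 * 255, 6) == 255, DivideByN(9, 3) == 3.
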
-void ScaleAddRows_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst_ptr,
- int src_width,
- int src_height) {
- const uint8_t* src_tmp;
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
asm volatile(
"1: \n"
- "mov %0, %1 \n"
- "mov w12, %w5 \n"
- "eor v2.16b, v2.16b, v2.16b \n"
- "eor v3.16b, v3.16b, v3.16b \n"
- "2: \n"
- // load 16 pixels into q0
- "ld1 {v0.16b}, [%0], %3 \n"
- "uaddw2 v3.8h, v3.8h, v0.16b \n"
- "uaddw v2.8h, v2.8h, v0.8b \n"
- "subs w12, w12, #1 \n"
- "b.gt 2b \n"
- "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
- "add %1, %1, #16 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
:
- : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
- );
+ : "memory", "cc", "v0", "v1", "v2" // Clobber List
+ );
}
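
For reference, a minimal scalar sketch of what the rewritten ScaleAddRow_NEON computes (assumed to mirror the C fallback): widen each source byte and accumulate it into the 16-bit row used by the box filter.

    #include <cstdint>

    static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int src_width) {
      // One widening add per pixel; the NEON version does 16 at a time.
      for (int x = 0; x < src_width; ++x) {
        dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
      }
    }
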
// TODO(Yang Zhang): Investigate less load instructions for
@@ -572,17 +581,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v3.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v1.4s, v1.4s, v0.4s \n"
+ "add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "add v2.4s, v1.4s, v3.4s \n"
- "shl v0.4s, v3.4s, #1 \n" // 8 * dx
- "1: \n"
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -591,27 +600,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "mov v6.16b, v1.16b \n"
- "mov v7.16b, v2.16b \n"
- "uzp1 v6.8h, v6.8h, v7.8h \n"
- "ushll v4.8h, v4.8b, #0 \n"
- "ushll v5.8h, v5.8b, #0 \n"
- "ssubl v16.4s, v5.4h, v4.4h \n"
- "ssubl2 v17.4s, v5.8h, v4.8h \n"
- "ushll v7.4s, v6.4h, #0 \n"
- "ushll2 v6.4s, v6.8h, #0 \n"
- "mul v16.4s, v16.4s, v7.4s \n"
- "mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
- "add v4.8h, v4.8h, v6.8h \n"
- "xtn v4.8b, v4.8h \n"
-
- "st1 {v4.8b}, [%0], #8 \n" // store pixels
- "add v1.4s, v1.4s, v0.4s \n"
- "add v2.4s, v2.4s, v0.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -635,74 +644,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
-
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
- "st1 {v0.b}[15], [%0] \n"
+ "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -721,17 +739,18 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "mov v2.16b, v3.16b \n"
- "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
@@ -742,19 +761,20 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
-
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "urhadd v1.16b, v2.16b, v3.16b \n"
- "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
@@ -763,25 +783,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+      "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 16
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -800,13 +822,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "ld1 {v0.s}[0], [%0], %3 \n"
- "ld1 {v0.s}[1], [%0], %3 \n"
- "ld1 {v0.s}[2], [%0], %3 \n"
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -824,33 +847,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
- "ld1 {v1.8b}, [%1], %4 \n"
- "ld1 {v2.8b}, [%0], %4 \n"
- "ld1 {v3.8b}, [%1], %4 \n"
- "ld1 {v4.8b}, [%0], %4 \n"
- "ld1 {v5.8b}, [%1], %4 \n"
- "ld1 {v6.8b}, [%0], %4 \n"
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -887,10 +912,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
// clang-format on
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -923,16 +949,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v6.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- "movi v3.16b, #0x7f \n" // 0x7F
- "movi v4.8h, #0x7f \n" // 0x7F
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v5.4s, v1.4s, v0.4s \n"
- "1: \n"
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
@@ -953,15 +979,15 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
-
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -984,26 +1010,28 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
"1: \n"
- "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #8 \n" // 8 processed per loop
- "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
- "uaddlp v1.4s, v1.8h \n"
- "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
- "uadalp v1.4s, v3.8h \n"
- "rshrn v0.4h, v0.4s, #2 \n" // round and pack
- "rshrn2 v0.8h, v1.4s, #2 \n"
- "st1 {v0.8h}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
// Read 8x2 upsample with filtering and write 16x1.
@@ -1013,38 +1041,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
uint16_t* dst,
int dst_width) {
asm volatile(
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "movi v0.8h, #9 \n" // constants
- "movi v1.4s, #3 \n"
+      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
"1: \n"
- "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
- "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
- "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
- "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
- "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
- "umull v16.4s, v3.4h, v0.4h \n"
- "umull2 v7.4s, v3.8h, v0.8h \n"
- "umull v18.4s, v4.4h, v0.4h \n"
- "umull2 v17.4s, v4.8h, v0.8h \n"
- "uaddw v16.4s, v16.4s, v6.4h \n"
- "uaddl2 v19.4s, v6.8h, v3.8h \n"
- "uaddl v3.4s, v6.4h, v3.4h \n"
- "uaddw2 v6.4s, v7.4s, v6.8h \n"
- "uaddl2 v7.4s, v5.8h, v4.8h \n"
- "uaddl v4.4s, v5.4h, v4.4h \n"
- "uaddw v18.4s, v18.4s, v5.4h \n"
- "mla v16.4s, v4.4s, v1.4s \n"
- "mla v18.4s, v3.4s, v1.4s \n"
- "mla v6.4s, v7.4s, v1.4s \n"
- "uaddw2 v4.4s, v17.4s, v5.8h \n"
- "uqrshrn v16.4h, v16.4s, #4 \n"
- "mla v4.4s, v19.4s, v1.4s \n"
- "uqrshrn2 v16.8h, v6.4s, #4 \n"
- "uqrshrn v17.4h, v18.4s, #4 \n"
- "uqrshrn2 v17.8h, v4.4s, #4 \n"
- "st2 {v16.8h-v17.8h}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1053,7 +1083,65 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"r"(14LL) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
"v19" // Clobber List
- );
+ );
+}
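
An editorial sketch of the 9:3:3:1 weighting implied by the #9 and #3 constants above: each upsampled output leans on its nearest source pixel, with the horizontal and vertical neighbors at 3 and the diagonal at 1, and uqrshrn #4 supplying the (sum + 8) >> 4 normalization.

    #include <cstdint>

    static uint16_t Upsample2x2_Sketch(uint16_t nearest, uint16_t horiz,
                                       uint16_t vert, uint16_t diag) {
      return (uint16_t)((9 * nearest + 3 * horiz + 3 * vert + diag + 8) >> 4);
    }
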
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
+ "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v16", "v17");
+}
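
ScaleUVRowDown2Box_NEON averages 2x2 blocks of interleaved U/V bytes: uaddlp/uadalp build the four-sample sums per channel and rshrn #2 is the rounded divide by 4. Scalar sketch, assumed to match the C fallback:

    #include <cstddef>
    #include <cstdint>

    static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_uv,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_uv, int dst_width) {
      const uint8_t* row1 = src_uv + src_stride;
      for (int x = 0; x < dst_width; ++x) {
        // Average a 2x2 block per channel; +2 mirrors rshrn's rounding.
        dst_uv[0] =
            (uint8_t)((src_uv[0] + src_uv[2] + row1[0] + row1[2] + 2) >> 2);  // U
        dst_uv[1] =
            (uint8_t)((src_uv[1] + src_uv[3] + row1[1] + row1[3] + 2) >> 2);  // V
        src_uv += 4;
        row1 += 4;
        dst_uv += 2;
      }
    }
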
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3");
}
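
One UV pair at a time, the gather above reduces to the following sketch (assumed equivalent to the C fallback); the four staggered pointers just let the NEON version fetch four pairs per iteration.

    #include <cstdint>

    static void ScaleUVRowDownEven_Sketch(const uint8_t* src_uv, int src_stepx,
                                          uint8_t* dst_uv, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst_uv[0] = src_uv[0];      // U
        dst_uv[1] = src_uv[1];      // V
        src_uv += src_stepx * 2;    // 2 bytes per UV pixel
        dst_uv += 2;
      }
    }
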
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
diff --git a/chromium/third_party/libyuv/source/scale_uv.cc b/chromium/third_party/libyuv/source/scale_uv.cc
new file mode 100644
index 00000000000..b0469f09b87
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_uv.cc
@@ -0,0 +1,891 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV plane to 1/2 of
+// its original size.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_C
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+ : ScaleUVRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ } else {
+ src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
+ }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+ : ScaleUVRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+ : ScaleUVRowDown2Box_SSSE3);
+ }
+ }
+#endif
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+ : ScaleUVRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+ : ScaleUVRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI
+ : ScaleUVRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI
+ : ScaleUVRowDown2Box_MMI);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+ : ScaleUVRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+ : ScaleUVRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWN2
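+
+// The selection ladder above repeats throughout this file; its shape, with
+// placeholder names (Kernel_C / Kernel_Any_NEON / Kernel_NEON are
+// illustrative, not real symbols):
+//
+//   kernel = Kernel_C;                    // portable fallback
+//   if (TestCpuFlag(kCpuHasNEON)) {
+//     kernel = Kernel_Any_NEON;           // any width, handles the tail
+//     if (IS_ALIGNED(dst_width, 8)) {
+//       kernel = Kernel_NEON;             // full SIMD blocks only
+//     }
+//   }
+//
+// The row loop then calls through the pointer once per destination row.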
+
+// ScaleUV, 1/4
+// This is an optimized version for scaling down a UV plane to 1/4 of
+// its original size.
+#if HAS_SCALEUVDOWN4BOX
+static void ScaleUVDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 * 2 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
+ ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2);
+ ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+#endif // HAS_SCALEUVDOWN4BOX
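+
+// Worked numbers for the temp rows above (editorial): each intermediate
+// row holds dst_width * 2 UV pixels at 2 bytes each, padded to a 16-byte
+// multiple so the SIMD row kernels can run at full width.
+//   dst_width = 100 -> kRowSize = (400 + 15) & ~15 = 400
+//   dst_width = 101 -> kRowSize = (404 + 15) & ~15 = 416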
+
+// ScaleUV Even
+// This is an optimized version for scaling down a UV plane by an even
+// integer factor.
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_uv, int dst_width) =
+ filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+ : ScaleUVRowDownEven_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif  // TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+ : ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif
+
+// Scale UV down with bilinear interpolation.
+#if HAS_SCALEUVBILINEARDOWN
+static void ScaleUVBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+ int64_t xl = (dx >= 0) ? x : xlast;
+ int64_t xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Rightmost pixel used. Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4-pixel-aligned rightmost pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2.
+ src_uv += xl * 2;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of UV.
+ {
+ align_buffer_64(row, clip_src_width * 2);
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif
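+
+// Worked example (editorial) of the read-window clip above, for
+// x = 0x8000 (start at pixel 0.5), dx = 0x28000 (2.5 px step), dst_width = 4:
+//   xlast = 0x8000 + 3 * 0x28000 = 0x80000   (last sample at pixel 8)
+//   xl    = (0x8000 >> 16) & ~3  = 0         (left edge, 4-aligned)
+//   xr    = (0x80000 >> 16) + 1  = 9         (bilinear also reads pixel 9)
+//   xr    = (9 + 1 + 3) & ~3     = 12        (padded, 4-aligned)
+//   clip_src_width = (12 - 0) * 2 = 24       (bytes: 12 UV pixels filtered)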
+
+// Scale UV up with bilinear interpolation.
+#if HAS_SCALEUVBILINEARUP
+static void ScaleUVBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+ }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVFilterCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVFilterCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * src_stride;
+
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8_t* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_uv + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif // HAS_SCALEUVBILINEARUP
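+
+// Kernel of the two-row cache above (editorial restatement): only two
+// column-scaled rows are live at once, and flipping the sign of rowstride
+// makes rowptr and rowptr + rowstride swap roles, so each source row is
+// scaled exactly once even though dy < 1 revisits rows:
+//
+//   if (yi != lasty) {                      // stepped onto a new source row
+//     ScaleUVFilterCols(rowptr, src, ...);  // overwrite the stale buffer
+//     rowptr += rowstride;
+//     rowstride = -rowstride;               // ping-pong
+//     lasty = yi;
+//   }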
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
+
+static void ScaleUVSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
+ (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+}
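+
+An editorial sketch of the 16.16 walk described above; in the real code x and dx come from ScaleSlope(), not from this naive quotient:
+
+    #include <cstdint>
+
+    static void ScaleUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_uv,
+                                  int src_width, int dst_width) {
+      int x = 0;
+      int dx = (src_width << 16) / dst_width;  // e.g. 640 -> 256 is 0x28000
+      for (int j = 0; j < dst_width; ++j) {
+        const uint8_t* s = src_uv + (x >> 16) * 2;  // 2 bytes per UV pixel
+        dst_uv[2 * j + 0] = s[0];
+        dst_uv[2 * j + 1] = s[1];
+        x += dx;  // low 16 bits accumulate the fractional position
+      }
+    }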
+
+// Copy UV with optional flipping
+#if HAS_UVCOPY
+static int UVCopy(const uint8_t* src_UV,
+ int src_stride_UV,
+ uint8_t* dst_UV,
+ int dst_stride_UV,
+ int width,
+ int height) {
+ if (!src_UV || !dst_UV || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_UV = src_UV + (height - 1) * src_stride_UV;
+ src_stride_UV = -src_stride_UV;
+ }
+
+ CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
+ return 0;
+}
+#endif // HAS_UVCOPY
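+
+// Usage sketch (hypothetical caller and dimensions): a negative height
+// makes UVCopy start at the last row and negate the stride, yielding a
+// vertically flipped copy of a 160x120 UV plane (strides in bytes):
+//
+//   UVCopy(src_uv, 320, dst_uv, 320, 160, -120);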
+
+// Scale a UV plane (from NV12)
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleUV(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64_t clipf = (int64_t)(clip_x)*dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 2;
+ dst += clip_x * 2;
+ }
+ if (clip_y) {
+ int64_t clipf = (int64_t)(clip_y)*dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+      // Optimized even scale down, i.e. 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+ ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+#endif
+ }
+      // Optimized odd scale down, i.e. 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+#ifdef HAS_UVCOPY
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst,
+ dst_stride, clip_width, clip_height);
+ return;
+ }
+#endif
+ }
+ }
+ }
+ // HAS_SCALEPLANEVERTICAL
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, 4, filtering);
+ return;
+ }
+
+#if HAS_SCALEUVBILINEARUP
+ if (filtering && dy < 65536) {
+ ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+ if (filtering) {
+ ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+ ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
+}
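+
+// Editorial decode of the step tests above (dx/dy are 16.16 source steps
+// per destination pixel):
+//   ((dx | dy) & 0xffff) == 0   -> both steps are whole source pixels
+//   (dx & 0x10000) == 0         -> even factor (2x, 4x, ...) when integer
+//   dx == 0x20000               -> exactly 1/2: ScaleUVDown2
+//   dx == 0x40000, kFilterBox   -> exactly 1/4: ScaleUVDown4Box
+//   dx == dy == 0x10000         -> unscaled: straight UVCopy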
+
+// Scale a UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+ dst_width, dst_height, 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
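+
+Usage sketch for the new public entry point (hypothetical caller; buffer sizes illustrative): halve the UV plane of a 640x360 NV12 frame. Widths and heights are in UV pixels, strides in bytes:
+
+    #include "libyuv/scale.h"
+
+    void HalveNV12UV(const uint8_t* src_uv, uint8_t* dst_uv) {
+      UVScale(src_uv, 640, 320, 180,  // src: 320x180 UV pixels, stride 640
+              dst_uv, 320, 160, 90,   // dst: 160x90 UV pixels, stride 320
+              kFilterBox);
+    }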
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/tools_libyuv/OWNERS b/chromium/third_party/libyuv/tools_libyuv/OWNERS
index 2cb971d2b72..aae4fb6e021 100644
--- a/chromium/third_party/libyuv/tools_libyuv/OWNERS
+++ b/chromium/third_party/libyuv/tools_libyuv/OWNERS
@@ -1 +1,4 @@
-phoglund@chromium.org
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
+
diff --git a/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py b/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py
index 37727ab1a69..9b9660de0bb 100755
--- a/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py
+++ b/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py
@@ -8,7 +8,7 @@
# be found in the AUTHORS file in the root of the source tree.
# This is a modified copy of the script in
-# https://chromium.googlesource.com/external/webrtc/+/master/tools-webrtc/autoroller/roll_deps.py
+# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py
# customized for libyuv.
@@ -22,7 +22,7 @@ import os
import re
import subprocess
import sys
-import urllib
+import urllib2
# Skip these dependencies (list without solution name prefix).
@@ -37,7 +37,7 @@ CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
-CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$')
+CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z-]+)\'$')
ROLL_BRANCH_NAME = 'roll_chromium_revision'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -69,6 +69,7 @@ def ParseDepsDict(deps_content):
local_scope = {}
global_scope = {
'Var': VarLookup(local_scope),
+ 'Str': lambda s: s,
'deps_os': {},
}
exec(deps_content, global_scope, local_scope)
@@ -90,7 +91,7 @@ def ParseCommitPosition(commit_message):
for line in reversed(commit_message.splitlines()):
m = COMMIT_POSITION_RE.match(line.strip())
if m:
- return m.group(1)
+ return int(m.group(1))
logging.error('Failed to parse commit position id from:\n%s\n',
commit_message)
sys.exit(-1)
@@ -109,7 +110,7 @@ def _RunCommand(command, working_dir=None, ignore_exit_code=False,
logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
env = os.environ.copy()
if extra_env:
- assert all(type(value) == str for value in extra_env.values())
+ assert all(isinstance(value, str) for value in extra_env.values())
logging.debug('extra env: %s', extra_env)
env.update(extra_env)
p = subprocess.Popen(command, stdout=subprocess.PIPE,
@@ -169,7 +170,7 @@ def ReadRemoteCrCommit(revision):
def ReadUrlContent(url):
"""Connect to a remote host and read the contents. Returns a list of lines."""
- conn = urllib.urlopen(url)
+ conn = urllib2.urlopen(url)
try:
return conn.readlines()
except IOError as e:
@@ -274,7 +275,7 @@ def CalculateChangedClang(new_cr_rev):
match = CLANG_REVISION_RE.match(line)
if match:
return match.group(1)
- raise RollError('Could not parse Clang revision!')
+    raise RollError('Could not parse Clang revision from:\n' +
+                    '\n'.join(' ' + l for l in lines))
with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f:
current_lines = f.readlines()
@@ -298,9 +299,6 @@ def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval))
commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE %
rev_interval))
- # TBR field will be empty unless in some custom cases, where some engineers
- # are added.
- tbr_authors = ''
if changed_deps_list:
commit_msg.append('Changed dependencies:')
@@ -322,7 +320,11 @@ def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
else:
commit_msg.append('No update to Clang.\n')
- commit_msg.append('TBR=%s' % tbr_authors)
+ # TBR needs to be non-empty for Gerrit to process it.
+ git_author = _RunCommand(['git', 'config', 'user.email'],
+ working_dir=CHECKOUT_SRC_DIR)[0].strip()
+ commit_msg.append('TBR=%s' % git_author)
+
commit_msg.append('BUG=None')
return '\n'.join(commit_msg)
@@ -397,20 +399,36 @@ def _LocalCommit(commit_msg, dry_run):
_RunCommand(['git', 'commit', '-m', commit_msg])
-def _UploadCL(dry_run, rietveld_email=None):
- logging.info('Uploading CL...')
- if not dry_run:
- cmd = ['git', 'cl', 'upload', '-f']
- if rietveld_email:
- cmd.append('--email=%s' % rietveld_email)
- _RunCommand(cmd, extra_env={'EDITOR': 'true'})
+def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos):
+ if skip_cq:
+ return 0
+ if (new_commit_pos - current_commit_pos) < cq_over:
+ return 1
+ return 2
-def _SendToCQ(dry_run, skip_cq):
- logging.info('Sending the CL to the CQ...')
- if not dry_run and not skip_cq:
- _RunCommand(['git', 'cl', 'set_commit'])
- logging.info('Sent the CL to the CQ.')
+def _UploadCL(commit_queue_mode):
+ """Upload the committed changes as a changelist to Gerrit.
+
+ commit_queue_mode:
+ - 2: Submit to commit queue.
+ - 1: Run trybots but do not submit to CQ.
+ - 0: Skip CQ, upload only.
+ """
+ cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail']
+ if commit_queue_mode >= 2:
+ logging.info('Sending the CL to the CQ...')
+ cmd.extend(['--use-commit-queue'])
+ elif commit_queue_mode >= 1:
+ logging.info('Starting CQ dry run...')
+ cmd.extend(['--cq-dry-run'])
+ extra_env = {
+ 'EDITOR': 'true',
+ 'SKIP_GCE_AUTH_FOR_GIT': '1',
+ }
+ stdout, stderr = _RunCommand(cmd, extra_env=extra_env)
+ logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s',
+ stdout, stderr)
def main():
@@ -420,10 +438,6 @@ def main():
p.add_argument('-r', '--revision',
help=('Chromium Git revision to roll to. Defaults to the '
'Chromium HEAD revision if omitted.'))
- p.add_argument('-u', '--rietveld-email',
- help=('E-mail address to use for creating the CL at Rietveld'
- 'If omitted a previously cached one will be used or an '
- 'error will be thrown during upload.'))
p.add_argument('--dry-run', action='store_true', default=False,
help=('Calculate changes and modify DEPS, but don\'t create '
'any local branch, commit, upload CL or send any '
@@ -432,8 +446,12 @@ def main():
default=False,
help=('Ignore if the current branch is not master or if there '
'are uncommitted changes (default: %(default)s).'))
- p.add_argument('--skip-cq', action='store_true', default=False,
- help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp = p.add_mutually_exclusive_group()
+ grp.add_argument('--skip-cq', action='store_true', default=False,
+ help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp.add_argument('--cq-over', type=int, default=1,
+ help=('Commit queue dry run if the revision difference '
+ 'is below this number (default: %(default)s)'))
p.add_argument('-v', '--verbose', action='store_true', default=False,
help='Be extra verbose in printing of log messages.')
opts = p.parse_args()
@@ -478,8 +496,11 @@ def main():
_CreateRollBranch(opts.dry_run)
UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps)
_LocalCommit(commit_msg, opts.dry_run)
- _UploadCL(opts.dry_run, opts.rietveld_email)
- _SendToCQ(opts.dry_run, opts.skip_cq)
+ commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over,
+ current_commit_pos, new_commit_pos)
+ logging.info('Uploading CL...')
+ if not opts.dry_run:
+ _UploadCL(commit_queue_mode)
return 0
diff --git a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS
index 0a919805c2c..9b67a8f6789 100644
--- a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS
+++ b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS
@@ -1,3 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
pbos@chromium.org
-phoglund@chromium.org
-
diff --git a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS
index da77b4ef23f..9b67a8f6789 100644
--- a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS
+++ b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS
@@ -1,4 +1,3 @@
-pbos@webrtc.org
-phoglund@webrtc.org
+mbonadei@chromium.org
fbarchard@chromium.org
-
+pbos@chromium.org
diff --git a/chromium/third_party/libyuv/unit_test/color_test.cc b/chromium/third_party/libyuv/unit_test/color_test.cc
index 4bb448d56fe..842fd994441 100644
--- a/chromium/third_party/libyuv/unit_test/color_test.cc
+++ b/chromium/third_party/libyuv/unit_test/color_test.cc
@@ -20,21 +20,19 @@
namespace libyuv {
-// TODO(fbarchard): Port high accuracy YUV to RGB to Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define ERROR_R 1
-#define ERROR_G 1
-#define ERROR_B 3
-#define ERROR_FULL 6
-#define ERROR_J420 5
+// TODO(fbarchard): clang x86 has a higher-accuracy YUV-to-RGB conversion.
+// Port it to Visual C and other CPUs.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define ERROR_FULL 5
+#define ERROR_J420 4
#else
+#define ERROR_FULL 6
+#define ERROR_J420 6
+#endif
#define ERROR_R 1
#define ERROR_G 1
#define ERROR_B 3
-#define ERROR_FULL 5
-#define ERROR_J420 3
-#endif
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
TEST_F(LibYUVColorTest, TESTNAME) { \
@@ -187,6 +185,52 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
+static void YUVHToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ H422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+static void YUVRec2020ToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ U422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
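The helpers read blue, green and red back from byte offsets 0, 1 and 2 because libyuv ARGB is the byte sequence B, G, R, A in memory, i.e. a little-endian 0xAARRGGBB word. A minimal illustration of that layout (not part of the tests):

```cpp
#include <cstdint>

// libyuv "ARGB" laid out in memory; this mirrors the readback order used by
// the helpers above (illustrative only).
struct ArgbPixel {
  uint8_t b;  // orig_pixels[0]
  uint8_t g;  // orig_pixels[1]
  uint8_t r;  // orig_pixels[2]
  uint8_t a;  // orig_pixels[3]
};

static_assert(sizeof(ArgbPixel) == 4, "packed 4-byte pixel");
```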
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@@ -335,18 +379,41 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
+// BT.601 YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
+// JPEG YUV to RGB reference
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
+// BT.709 YUV to RGB reference
+// See also http://www.equasys.de/colorconversion.html
+static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
+ *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.213 - (v - 128) * 0.533);
+ *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.112);
+}
+
+// BT.2020 YUV to RGB reference
+static void YUVRec2020ToRGBReference(int y,
+ int u,
+ int v,
+ int* r,
+ int* g,
+ int* b) {
+ *r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
+ *g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
+ (v - 128) * 0.65042);
+ *b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
+}
+
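All of the limited-range reference functions share the (y - 16) * 1.164 luma term and differ only in their chroma coefficients. A quick grey-point check of the BT.601 version above (an illustrative sketch, not part of the tests): Y = U = V = 128 should yield near-equal RGB.

```cpp
#include <cstdio>

int main() {
  const double y = 128, u = 128, v = 128;
  // Same coefficients as YUVToRGBReference above, with the signs folded.
  const double r = (y - 16) * 1.164 + (v - 128) * 1.596;
  const double g = (y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813;
  const double b = (y - 16) * 1.164 + (u - 128) * 2.018;
  std::printf("R=%.1f G=%.1f B=%.1f\n", r, g, b);  // 130.4 for all three
  return 0;
}
```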
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@@ -473,7 +540,11 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
// Step by 5 on the inner loop goes from 0 to 255 inclusive.
// Set to 1 for better coverage. 3, 5 or 17 for faster testing.
+#ifdef ENABLE_SLOW_TESTS
+#define FASTSTEP 1
+#else
#define FASTSTEP 5
+#endif
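FASTSTEP trades coverage for runtime: the U and V loops below are always exhaustive, so the step only thins the Y axis. A quick tally of how many (u, v, y) triples each setting visits per test (an illustrative sketch, not part of the suite):

```cpp
#include <cstdio>

int main() {
  const int steps[] = {1, 3, 5, 17};
  for (int step : steps) {
    const long y_samples = 255 / step + 1;       // trip count of the Y loop
    const long total = 256L * 256L * y_samples;  // all (u, v, y) triples
    std::printf("FASTSTEP=%2d -> %ld triples\n", step, total);
  }
  return 0;
}
```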
TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = {
0,
@@ -531,6 +602,66 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
}
PrintHistogram(rh, gh, bh);
}
+
+TEST_F(LibYUVColorTest, TestFullYUVH) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVHToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVHToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ // TODO(crbug.com/libyuv/862): Reduce the errors in the B channel.
+ EXPECT_NEAR(b0, b1, 15);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+TEST_F(LibYUVColorTest, TestFullYUVRec2020) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVRec2020ToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVRec2020ToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ // TODO(crbug.com/libyuv/863): Reduce the errors in the B channel.
+ EXPECT_NEAR(b0, b1, 18);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
#undef FASTSTEP
TEST_F(LibYUVColorTest, TestGreyYUVJ) {
diff --git a/chromium/third_party/libyuv/unit_test/compare_test.cc b/chromium/third_party/libyuv/unit_test/compare_test.cc
index 136254e169b..bd99cdd3ac3 100644
--- a/chromium/third_party/libyuv/unit_test/compare_test.cc
+++ b/chromium/third_party/libyuv/unit_test/compare_test.cc
@@ -15,10 +15,13 @@
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
-#include "libyuv/compare_row.h" /* For HammingDistance_C */
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/compare_row.h" /* For HammingDistance_C */
+#endif
+
namespace libyuv {
// hash seed of 5381 recommended.
@@ -206,6 +209,7 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
free_aligned_buffer_page_end(src_a);
}
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
@@ -403,6 +407,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
}
+#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, TestHammingDistance) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc
index e11b101fca2..59a9480d679 100644
--- a/chromium/third_party/libyuv/unit_test/convert_test.cc
+++ b/chromium/third_party/libyuv/unit_test/convert_test.cc
@@ -12,8 +12,6 @@
#include <stdlib.h>
#include <time.h>
-#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
-
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -29,12 +27,14 @@
#include "libyuv/rotate.h"
#include "libyuv/video_common.h"
-#if defined(__arm__) || defined(__aarch64__)
-// arm version subsamples by summing 4 pixels then multiplying by matrix with
-// 4x smaller coefficients which are rounded to nearest integer.
-#define ARM_YUV_ERROR 4
-#else
-#define ARM_YUV_ERROR 0
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+#endif
+
+// Some functions fail on big-endian CPUs. Enable these tests on all CPUs
+// except PowerPC; they are not optimized, so they are disabled by default.
+#if !defined(__powerpc__) && defined(ENABLE_SLOW_TESTS)
+#define LITTLE_ENDIAN_ONLY_TEST 1
#endif
namespace libyuv {
@@ -216,41 +216,23 @@ TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_EQ(0, max_diff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 3); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 3); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
@@ -281,6 +263,23 @@ TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+// Wrapper to keep the API the same as the three-plane conversions.
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* /* src_u */,
+ int /* src_stride_u */,
+ const uint8_t* /* src_v */,
+ int /* src_stride_v */,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, width, height);
+}
+
#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
@@ -294,10 +293,10 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
@@ -312,46 +311,33 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>( \
- dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j], \
+ dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
@@ -374,6 +360,92 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
+TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2)
+TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2)
+TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
+TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
+
+#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
+ OFF, DOY) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, \
+ 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, \
+ 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
+ dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
+ kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \
+ NEG kHeight); \
+ } \
+ if (DOY) { \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
+ 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
+
+TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
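The _NullY variant above passes NULL for the Y destination, so the harness also covers callers that only want the chroma plane rewritten. Under the assumption that NV21ToNV12 is declared in libyuv/planar_functions.h in this tree (the header placement is a guess), that usage looks like:

```cpp
#include <cstdint>

#include "libyuv/planar_functions.h"  // assumed home of NV21ToNV12

// Rewrite NV21 chroma (VU) as NV12 (UV) and skip the luma copy by passing
// NULL for dst_y, as the _NullY test variant does. Strides assume tightly
// packed planes; illustrative only.
int Nv21ChromaToNv12(const uint8_t* src_y, const uint8_t* src_vu,
                     uint8_t* dst_uv, int width, int height) {
  const int uv_stride = ((width + 1) / 2) * 2;
  return libyuv::NV21ToNV12(src_y, width, src_vu, uv_stride,
                            /*dst_y=*/NULL, width, dst_uv, uv_stride, width,
                            height);
}
```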
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
@@ -428,43 +500,25 @@ TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
if (DOY) { \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
} \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
- EXPECT_LE(max_diff, 1); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
@@ -554,43 +608,60 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
YALIGN, benchmark_width_, _Opt, +, 0)
TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
+#endif
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
+#endif
#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
+ YALIGN, W1280, N, NEG, OFF, ATTEN) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -625,15 +696,9 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
ATTEN); \
} \
- int max_diff = 0; \
for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \
- static_cast<int>(dst_argb_opt[i + OFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
@@ -643,23 +708,48 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
}
#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
+ YALIGN) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
+ YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \
+ YALIGN, benchmark_width_, _Unaligned, +, 1, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
-
-TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
-TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
-
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- W1280, DIFF, N, NEG, OFF) \
+ YALIGN, benchmark_width_, _Premult, +, 0, 1)
+
+#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+
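These macros adapt the test harness to the *Matrix entry points; outside the tests the same conversions are reached by passing the color-space constants directly. A sketch, under the assumption that I420AlphaToARGBMatrix and kYuvH709Constants are declared in libyuv/convert_argb.h as in this tree, with illustrative buffer sizes:

```cpp
#include <cstdint>
#include <vector>

#include "libyuv/convert_argb.h"

void ConvertH420AlphaFrame(int width, int height) {
  const int half_w = (width + 1) / 2;
  const int half_h = (height + 1) / 2;
  std::vector<uint8_t> y(width * height), a(width * height);
  std::vector<uint8_t> u(half_w * half_h), v(half_w * half_h);
  std::vector<uint8_t> argb(width * height * 4);
  // BT.709 limited range; a final argument of 1 would premultiply
  // (attenuate) by alpha, as the _Premult variant above does.
  libyuv::I420AlphaToARGBMatrix(y.data(), width, u.data(), half_w, v.data(),
                                half_w, a.data(), width, argb.data(),
                                width * 4, &libyuv::kYuvH709Constants, width,
                                height, /*attenuate=*/0);
}
```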
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
+ BPP_B, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -694,22 +784,16 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
- FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
+ FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
kHeight); \
- FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+ FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
kHeight); \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j], \
+ dst_argb32_opt[i * kWidth * 4 + j]); \
} \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
free_aligned_buffer_page_end(dst_argb_c); \
@@ -718,89 +802,62 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
free_aligned_buffer_page_end(dst_argb32_opt); \
}
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB24, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RGB24, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
-
-#ifdef DO_THREE_PLANES
-// Do 3 allocations for yuv. conventional but slower.
-#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \
- kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \
- kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \
- static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \
- static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-#else
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Opt, +, 0)
+
+#define JNV12ToARGB(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToARGB(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToABGR(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToABGR(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB24(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToRGB24(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToRAW(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToRAW(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB565(a, b, c, d, e, f, g, h) \
+ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+
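JNV12ToABGR and JNV21ToABGR exploit a symmetry rather than a dedicated ABGR path: calling the opposite NV entry point with the VU-swapped (kYvu*) constants swaps R and B in the output, which turns ARGB into ABGR. A sketch, assuming the declarations in libyuv/convert_argb.h:

```cpp
#include <cstdint>

#include "libyuv/convert_argb.h"

// Full-range (JPEG) NV12 -> ABGR. NV21ToARGBMatrix reads the chroma plane as
// VU; feeding it NV12 (UV) together with the Yvu constants mirrors R and B,
// which is exactly ABGR. Illustrative only.
int JpegNV12ToABGR(const uint8_t* y, int y_stride, const uint8_t* uv,
                   int uv_stride, uint8_t* abgr, int abgr_stride, int width,
                   int height) {
  return libyuv::NV21ToARGBMatrix(y, y_stride, uv, uv_stride, abgr,
                                  abgr_stride, &libyuv::kYvuJPEGConstants,
                                  width, height);
}
```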
+TESTBIPLANARTOB(JNV12, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(JNV21, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(JNV12, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(JNV21, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(JNV12, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(JNV21, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(JNV12, 2, 2, RAW, RAW, 3)
+TESTBIPLANARTOB(JNV21, 2, 2, RAW, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBIPLANARTOB(JNV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4)
+TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4)
+TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3)
+TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3)
+TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3)
+TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
+ W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -832,14 +889,12 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
} \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_uv_c[i * kStrideUV + j]), \
- static_cast<int>(dst_uv_opt[i * kStrideUV + j]), DIFF); \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
} \
} \
free_aligned_buffer_page_end(dst_y_c); \
@@ -848,39 +903,39 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_argb); \
}
-#endif
-#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- DIFF) \
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ benchmark_width_ - 4, _Any, +, 0) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
+ benchmark_width_, _Unaligned, +, 1) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
+ benchmark_width_, _Invert, -, 0) \
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
-TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
-TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
-// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
-TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
-TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
-TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
-TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
-TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
-TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
-TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
+ benchmark_width_, _Opt, +, 0)
+
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1)
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2)
+TESTATOPLANAR(J400, 1, 1, J420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
SUBSAMP_Y, W1280, N, NEG, OFF) \
@@ -911,28 +966,17 @@ TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
- EXPECT_LE(max_diff, 4); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < kStrideUV * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \
+ dst_uv_opt[i * kStrideUV * 2 + j]); \
} \
} \
- EXPECT_LE(max_diff, 4); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
@@ -952,11 +996,15 @@ TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
+TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2)
+TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2)
TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ HEIGHT_B, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -982,22 +1030,16 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \
kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
}
#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
+ STRIDE_B, HEIGHT_B) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
for (int times = 0; times < benchmark_iterations_; ++times) { \
const int kWidth = (fastrand() & 63) + 1; \
@@ -1023,7 +1065,7 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
kHeight); \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
@@ -1032,61 +1074,79 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
}
#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
+ HEIGHT_B) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ HEIGHT_B, benchmark_width_, _Invert, -, 0) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ HEIGHT_B, benchmark_width_, _Opt, +, 0) \
TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
-
-// TODO(fbarchard): make ARM version of C code that matches NEON.
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
-TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+ HEIGHT_B)
+
+TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1)
+TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1)
+#endif
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1)
+#endif
+TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1)
+TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1)
+#endif
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1)
+#endif
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1)
+TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1)
+TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1)
+TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
+#endif
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1) // 4
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1)
+TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1)
+TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1)
+TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1)
+TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1)
+TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1)
+TESTATOB(RGB24, 3, 3, 1, RGB24Mirror, 3, 3, 1)
+TESTATOB(RAW, 3, 3, 1, J400, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1)
+#endif
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1)
+TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1)
#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ HEIGHT_B, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -1112,22 +1172,16 @@ TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
kStrideB, NULL, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
}
#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
+ STRIDE_B, HEIGHT_B) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
for (int times = 0; times < benchmark_iterations_; ++times) { \
const int kWidth = (fastrand() & 63) + 1; \
@@ -1152,15 +1206,9 @@ TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
MaskCpuFlags(benchmark_cpu_info_); \
FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
NULL, kWidth, kHeight); \
- int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
@@ -1168,19 +1216,21 @@ TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
}
#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
+ HEIGHT_B) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ HEIGHT_B, benchmark_width_, _Invert, -, 0) \
TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ HEIGHT_B, benchmark_width_, _Opt, +, 0) \
TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
+ HEIGHT_B)
-TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
+#endif
#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \
@@ -1267,6 +1317,7 @@ TEST_F(LibYUVConvertTest, ValidateJpeg) {
// EOI, SOI. Expect pass.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - kOff + 0] = 0xff;
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
@@ -1293,6 +1344,7 @@ TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
// EOI, SOI. Expect pass.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - kOff + 0] = 0xff;
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
@@ -1326,6 +1378,7 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) {
// SOI but no EOI. Expect fail.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
for (int times = 0; times < benchmark_iterations_; ++times) {
EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
}
@@ -1343,85 +1396,823 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) {
TEST_F(LibYUVConvertTest, FuzzJpeg) {
// SOI but no EOI. Expect fail.
for (int times = 0; times < benchmark_iterations_; ++times) {
- const int kSize = fastrand() % 5000 + 2;
+ const int kSize = fastrand() % 5000 + 3;
align_buffer_page_end(orig_pixels, kSize);
MemRandomize(orig_pixels, kSize);
// Add SOI so frame will be scanned.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - 1] = 0xff;
- ValidateJpeg(orig_pixels, kSize); // Failure normally expected.
+ ValidateJpeg(orig_pixels,
+ kSize); // Failure normally expected.
free_aligned_buffer_page_end(orig_pixels);
}
}
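The extra orig_pixels[2] = 0xff seeded in the tests above, together with the fuzzer's minimum buffer size growing from 2 to 3, suggests the validator now reads at least one byte past the SOI marker. A sketch of the skeleton the tests construct, assuming ValidateJpeg is declared in libyuv/mjpeg_decoder.h; whether a given buffer passes also depends on the validator's other heuristics:

```cpp
#include <cstdint>
#include <cstring>

#include "libyuv/mjpeg_decoder.h"  // assumed home of ValidateJpeg

// Build the smallest plausible JPEG skeleton the tests use: FF D8 FF up
// front and FF D9 (EOI) at the end. Requires size >= 5; illustrative only.
bool LooksLikeJpeg(uint8_t* buf, size_t size) {
  std::memset(buf, 0, size);
  buf[0] = 0xff;
  buf[1] = 0xd8;  // SOI.
  buf[2] = 0xff;  // Byte the tightened scan expects after SOI.
  buf[size - 2] = 0xff;
  buf[size - 1] = 0xd9;  // EOI.
  return libyuv::ValidateJpeg(buf, size) != 0;
}
```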
-TEST_F(LibYUVConvertTest, MJPGToI420) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
- align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
+// Test data created in GIMP. In the JPEG export dialog, disable
+// thumbnails etc., choose a subsampling, and use low quality
+// (50) to keep the size small. Generated with xxd -i test.jpg.
+// test 0 is J400
+static const uint8_t kTest0Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10,
+ 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01,
+ 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest0JpgLen = 421;
+
+// test 1 is J444
+static const uint8_t kTest1Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda,
+ 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01,
+ 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb,
+ 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11,
+ 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00,
+ 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99,
+ 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31,
+ 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01,
+ 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72,
+ 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11,
+ 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00,
+ 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2,
+ 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c,
+ 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61,
+ 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21,
+ 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01,
+ 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48,
+ 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01,
+ 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff,
+ 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26,
+ 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02,
+ 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5,
+ 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00,
+ 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61,
+ 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01,
+ 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a,
+ 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96,
+ 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad,
+ 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7,
+ 0xd4, 0xff, 0xd9};
+static const size_t kTest1JpgLen = 735;
+
+// test 2 is J420
+static const uint8_t kTest2Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00,
+ 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10,
+ 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02,
+ 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62,
+ 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f,
+ 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c,
+ 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f,
+ 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11,
+ 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e,
+ 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01,
+ 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest2JpgLen = 685;
+
+// test 3 is J422
+static const uint8_t kTest3Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03,
+ 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84,
+ 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32,
+ 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00,
+ 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31,
+ 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f,
+ 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9,
+ 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6,
+ 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03,
+ 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff,
+ 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53,
+ 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca,
+ 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04,
+ 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9,
+ 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5,
+ 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c,
+ 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00,
+ 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest3JpgLen = 704;
+
+// test 4 is J422 vertical - not supported
+static const uint8_t kTest4Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01,
+ 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff,
+ 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02,
+ 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9,
+ 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01,
+ 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0,
+ 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e,
+ 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde,
+ 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a,
+ 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca,
+ 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff,
+ 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31,
+ 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a,
+ 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd,
+ 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30,
+ 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
+ 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest4JpgLen = 701;
+
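+// Note: kTest0Jpg..kTest4Jpg above are tiny 32x16 progressive JPEGs covering
+// J400, J444, J420, J422 and an unsupported vertical J422, used by the MJPG
+// tests below.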
+TEST_F(LibYUVConvertTest, TestMJPGSize) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ printf("test jpeg size %d x %d\n", width, height);
+}
- // EOI, SOI to make MJPG appear valid.
- memset(orig_pixels, 0, kSize);
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+TEST_F(LibYUVConvertTest, TestMJPGToI420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
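+ // Scale the iteration count so the total pixels processed stays comparable
+ // to the configured benchmark size; the test image is much smaller.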
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_u, half_width * half_height);
+ align_buffer_page_end(dst_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
+ dst_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
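+ // (HashDjb2 is the djb2 hash; the 5381 argument is its standard seed, and
+ // a matching hash pins every decoded byte.)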
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
+ uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_u_hash, 2501859930u);
+ EXPECT_EQ(dst_v_hash, 2126459123u);
- for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret =
- MJPGToI420(orig_pixels, kSize, dst_y_opt, benchmark_width_, dst_u_opt,
- SUBSAMPLE(benchmark_width_, 2), dst_v_opt,
- SUBSAMPLE(benchmark_width_, 2), benchmark_width_,
- benchmark_height_, benchmark_width_, benchmark_height_);
- // Expect failure because image is not really valid.
- EXPECT_EQ(1, ret);
- }
-
- free_aligned_buffer_page_end(dst_y_opt);
- free_aligned_buffer_page_end(dst_u_opt);
- free_aligned_buffer_page_end(dst_v_opt);
- free_aligned_buffer_page_end(orig_pixels);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
}
-TEST_F(LibYUVConvertTest, MJPGToARGB) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
- align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4);
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
- // EOI, SOI to make MJPG appear valid.
- memset(orig_pixels, 0, kSize);
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
- for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret = MJPGToARGB(orig_pixels, kSize, dst_argb_opt, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_);
- // Expect failure because image is not really valid.
- EXPECT_EQ(1, ret);
+ // Convert to NV21
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
+ half_width * 2, width, height, width, height);
}
+ // Expect success.
+ EXPECT_EQ(0, ret);
- free_aligned_buffer_page_end(dst_argb_opt);
- free_aligned_buffer_page_end(orig_pixels);
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV21
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_vu, half_width * half_height * 2);
+
+ I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_vu, half_width * 2, width, height);
+
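+ // Decoding straight to NV21 must match decoding to I420 and then
+ // converting with I420ToNV21.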
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_vu[i], dst3_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_vu);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ // Convert to NV12
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV12
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_uv, half_width * half_height * 2);
+
+ I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_uv, half_width * 2, width, height);
+
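+ // Decoding straight to NV12 must match decoding to I420 and then
+ // converting with I420ToNV12.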
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_uv[i], dst3_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_uv);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 3543430771u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 3543430771u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_uv_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_vu_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. The golden hashes were computed
+ // over VU (NV21) order, so swap the UV plane to VU before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_argb, width * height * 4);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width,
+ height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
+ EXPECT_EQ(dst_argb_hash, 2355976473u);
+
+ free_aligned_buffer_page_end(dst_argb);
}
+static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+
+ int width = mjpeg_decoder.GetWidth();
+ int height = mjpeg_decoder.GetHeight();
+
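+ // Classify the chroma layout from the per-component sampling factors: Y is
+ // sampled 2x2 for J420, 2x1 for J422 and 1x1 for J444; J400 has a single
+ // grey component.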
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J420, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J422, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J444, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ printf("JPeg is J400, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ } else {
+ // Unknown colorspace.
+ printf("JPeg is Unknown colorspace.\n");
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret;
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGInfo) {
+ EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
+ kTest4JpgLen)); // Valid but unsupported.
+}
#endif // HAVE_JPEG
TEST_F(LibYUVConvertTest, NV12Crop) {
@@ -1504,6 +2295,78 @@ TEST_F(LibYUVConvertTest, NV12Crop) {
free_aligned_buffer_page_end(src_y);
}
+TEST_F(LibYUVConvertTest, I420CropOddY) {
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y = 1;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size = kWidth * kHeight +
+ kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) +
+ kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y);
+ align_buffer_page_end(src_y, sample_size);
+ uint8_t* src_u = src_y + kWidth * kHeight;
+ uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y);
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) {
+ src_u[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) {
+ src_v[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0,
+ libyuv::FOURCC_I420);
+ }
+
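+ // crop_y = 1 shifts luma by one full row, but chroma by crop_y / 2 = 0
+ // rows; the per-plane checks below use exactly those offsets.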
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
+ dst_y[i * kDestWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
+ dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
+ dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(src_y);
+}
+
TEST_F(LibYUVConvertTest, TestYToARGB) {
uint8_t y[32];
uint8_t expectedg[32];
@@ -1588,7 +2451,7 @@ TEST_F(LibYUVConvertTest, TestDither) {
}
#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+ YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -1619,7 +2482,6 @@ TEST_F(LibYUVConvertTest, TestDither) {
src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
/* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
@@ -1630,13 +2492,8 @@ TEST_F(LibYUVConvertTest, TestDither) {
FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
kWidth * BPP_C, kWidth, kHeight); \
for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
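+ /* Require a bit-exact match between the C and optimized paths. */ \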
+ EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
@@ -1646,20 +2503,20 @@ TEST_F(LibYUVConvertTest, TestDither) {
free_aligned_buffer_page_end(dst_argb32_opt); \
}
-#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
-TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
+#endif
#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
TEST_F(LibYUVConvertTest, NAME) { \
@@ -1783,12 +2640,14 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
@@ -1800,20 +2659,30 @@ TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+#endif
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
@@ -1887,6 +2756,12 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
OFF, FMT_C, BPP_C) \
@@ -1937,6 +2812,7 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
_Opt, +, 0, FMT_C, BPP_C)
// Caveat: Destination needs to be 4 bytes
+#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
@@ -1945,6 +2821,7 @@ TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
+#endif
TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
// 2x2 frames
@@ -2051,7 +2928,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
// TODO(fbarchard): Fix clamping issue affected by U channel.
#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \
+ ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
@@ -2087,15 +2964,9 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
} \
- int max_diff = 0; \
for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
- static_cast<int>(dst_argb_opt[i + DOFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
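+ /* Require a bit-exact match between the C and optimized paths. */ \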
+ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
@@ -2104,24 +2975,42 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
}
#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
+ YALIGN) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
+ YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1) \
+ YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0)
-
-TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 2)
+ YALIGN, benchmark_width_, _Opt, +, 0, 0)
+
+TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1)
+#endif
static int Clamp(int y) {
if (y < 0) {
@@ -2266,7 +3155,8 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) {
}
// Test 10 bit YUV to 10 bit RGB
-// Caveat: Result is near due to float rounding in expected result.
+// Caveat: Result is near due to float rounding in expected
+// result.
TEST_F(LibYUVConvertTest, TestH010ToAR30) {
const int kSize = 1024;
int histogram_b[1024];
@@ -2329,7 +3219,8 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) {
}
// Test 10 bit YUV to 10 bit RGB
-// Caveat: Result is near due to float rounding in expected result.
+// Caveat: Result is near due to float rounding in expected
+// result.
TEST_F(LibYUVConvertTest, TestH010ToAB30) {
const int kSize = 1024;
int histogram_b[1024];
@@ -2477,4 +3368,66 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
free_aligned_buffer_page_end(dest_rgb24);
}
+// Test that I400 with the JPEG matrix matches J400.
+TEST_F(LibYUVConvertTest, TestI400) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_i400, kSize);
+ align_buffer_page_end(argb_pixels_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_j400, kSize * 4);
+ align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_h709_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_2020_i400, kSize * 4);
+
+ // Fill with a grey scale ramp.
+ for (int i = 0; i < kSize; ++i) {
+ orig_i400[i] = i;
+ }
+
+ J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1);
+ I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
+ kSize, 1);
+
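+ // I400, H709 and 2020 use limited-range matrices (black at Y=16, gain of
+ // 255/219), while J400 and the JPEG matrix are full range; hence Y=16 maps
+ // to 0 on the limited-range paths but stays 16 for J400/JPEG.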
+ EXPECT_EQ(0, argb_pixels_i400[0]);
+ EXPECT_EQ(0, argb_pixels_j400[0]);
+ EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[0]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[0]);
+ EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
+ EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
+ EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);
+
+ for (int i = 0; i < kSize * 4; ++i) {
+ if ((i & 3) == 3) {
+ EXPECT_EQ(255, argb_pixels_j400[i]);
+ } else {
+ EXPECT_EQ(i / 4, argb_pixels_j400[i]);
+ }
+ EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_i400);
+ free_aligned_buffer_page_end(argb_pixels_i400);
+ free_aligned_buffer_page_end(argb_pixels_j400);
+ free_aligned_buffer_page_end(argb_pixels_jpeg_i400);
+ free_aligned_buffer_page_end(argb_pixels_h709_i400);
+ free_aligned_buffer_page_end(argb_pixels_2020_i400);
+}
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc
index c4648bb949f..7264de08016 100644
--- a/chromium/third_party/libyuv/unit_test/cpu_test.cc
+++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc
@@ -67,6 +67,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has MIPS %d\n", has_mips);
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MSA %d\n", has_msa);
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ printf("Has MMI %d\n", has_mmi);
#endif
}
@@ -158,7 +160,29 @@ TEST_F(LibYUVBaseTest, TestLinuxNeon) {
#endif
}
+TEST_F(LibYUVBaseTest, TestLinuxMipsMsaMmi) {
+ if (FileExists("../../unit_test/testdata/mips.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/mips.txt\"\n");
+
+ EXPECT_EQ(0, MipsCpuCaps("../../unit_test/testdata/mips.txt"));
+ EXPECT_EQ(kCpuHasMMI,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson3.txt"));
+ EXPECT_EQ(kCpuHasMMI,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson_mmi.txt"));
+ EXPECT_EQ(kCpuHasMSA, MipsCpuCaps("../../unit_test/testdata/mips_msa.txt"));
+ EXPECT_EQ(kCpuHasMMI | kCpuHasMSA,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson2k.txt"));
+ } else {
+ printf("WARNING: unable to load \"../../unit_test/testdata/mips.txt\"\n");
+ }
+}
+
+// TODO(fbarchard): Fix clangcl test of cpuflags.
+#ifdef _MSC_VER
+TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) {
+#else
TEST_F(LibYUVBaseTest, TestSetCpuFlags) {
+#endif
// Reset any masked flags that may have been set so auto init is enabled.
MaskCpuFlags(0);
diff --git a/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc b/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc
index 59061b98e0b..69aab74e7c8 100644
--- a/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc
+++ b/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc
@@ -12,7 +12,7 @@
#include "libyuv/cpu_id.h"
-#if defined(__clang__)
+#if defined(__clang__) && !defined(__wasm__)
#if __has_include(<pthread.h>)
#define LIBYUV_HAVE_PTHREAD 1
#endif
@@ -30,7 +30,7 @@ namespace libyuv {
void* ThreadMain(void* arg) {
int* flags = static_cast<int*>(arg);
- *flags = TestCpuFlag(kCpuHasSSSE3);
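+ // kCpuInitialized exists on every architecture, which presumably keeps
+ // this thread test portable (kCpuHasSSSE3 is x86-specific).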
+ *flags = TestCpuFlag(kCpuInitialized);
return nullptr;
}
#endif // LIBYUV_HAVE_PTHREAD
diff --git a/chromium/third_party/libyuv/unit_test/math_test.cc b/chromium/third_party/libyuv/unit_test/math_test.cc
index 0abbad51321..a1544c122b5 100644
--- a/chromium/third_party/libyuv/unit_test/math_test.cc
+++ b/chromium/third_party/libyuv/unit_test/math_test.cc
@@ -16,10 +16,14 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h"
+#endif
namespace libyuv {
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVBaseTest, TestFixedDiv) {
int num[1280];
int div[1280];
@@ -151,5 +155,6 @@ TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
EXPECT_NEAR(result_c[j], result_opt[j], 1);
}
}
+#endif // ENABLE_ROW_TESTS
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc
index 756089558f7..e05ff15640c 100644
--- a/chromium/third_party/libyuv/unit_test/planar_test.cc
+++ b/chromium/third_party/libyuv/unit_test/planar_test.cc
@@ -12,9 +12,6 @@
#include <stdlib.h>
#include <time.h>
-// row.h defines SIMD_ALIGNED, overriding unit_test.h
-#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
-
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -24,6 +21,13 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
+// row.h defines SIMD_ALIGNED, overriding unit_test.h
+// TODO(fbarchard): Remove row.h from unittests. Test public functions.
+#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
+#endif
namespace libyuv {
@@ -277,6 +281,7 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
}
}
+// EXPECT_NEAR below allows for legacy platforms that round slightly
+// differently.
TEST_F(LibYUVPlanarTest, TestARGBGray) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
@@ -313,17 +318,17 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
- EXPECT_EQ(30u, orig_pixels[0][0]);
- EXPECT_EQ(30u, orig_pixels[0][1]);
- EXPECT_EQ(30u, orig_pixels[0][2]);
+ EXPECT_NEAR(29u, orig_pixels[0][0], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][1], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][2], 1);
EXPECT_EQ(128u, orig_pixels[0][3]);
EXPECT_EQ(149u, orig_pixels[1][0]);
EXPECT_EQ(149u, orig_pixels[1][1]);
EXPECT_EQ(149u, orig_pixels[1][2]);
EXPECT_EQ(0u, orig_pixels[1][3]);
- EXPECT_EQ(76u, orig_pixels[2][0]);
- EXPECT_EQ(76u, orig_pixels[2][1]);
- EXPECT_EQ(76u, orig_pixels[2][2]);
+ EXPECT_NEAR(77u, orig_pixels[2][0], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][1], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][2], 1);
EXPECT_EQ(255u, orig_pixels[2][3]);
EXPECT_EQ(0u, orig_pixels[3][0]);
EXPECT_EQ(0u, orig_pixels[3][1]);
@@ -333,9 +338,9 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
EXPECT_EQ(255u, orig_pixels[4][1]);
EXPECT_EQ(255u, orig_pixels[4][2]);
EXPECT_EQ(255u, orig_pixels[4][3]);
- EXPECT_EQ(96u, orig_pixels[5][0]);
- EXPECT_EQ(96u, orig_pixels[5][1]);
- EXPECT_EQ(96u, orig_pixels[5][2]);
+ EXPECT_NEAR(97u, orig_pixels[5][0], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][1], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][2], 1);
EXPECT_EQ(224u, orig_pixels[5][3]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
@@ -385,30 +390,30 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
- EXPECT_EQ(30u, gray_pixels[0][0]);
- EXPECT_EQ(30u, gray_pixels[0][1]);
- EXPECT_EQ(30u, gray_pixels[0][2]);
- EXPECT_EQ(128u, gray_pixels[0][3]);
- EXPECT_EQ(149u, gray_pixels[1][0]);
- EXPECT_EQ(149u, gray_pixels[1][1]);
- EXPECT_EQ(149u, gray_pixels[1][2]);
- EXPECT_EQ(0u, gray_pixels[1][3]);
- EXPECT_EQ(76u, gray_pixels[2][0]);
- EXPECT_EQ(76u, gray_pixels[2][1]);
- EXPECT_EQ(76u, gray_pixels[2][2]);
- EXPECT_EQ(255u, gray_pixels[2][3]);
- EXPECT_EQ(0u, gray_pixels[3][0]);
- EXPECT_EQ(0u, gray_pixels[3][1]);
- EXPECT_EQ(0u, gray_pixels[3][2]);
- EXPECT_EQ(255u, gray_pixels[3][3]);
- EXPECT_EQ(255u, gray_pixels[4][0]);
- EXPECT_EQ(255u, gray_pixels[4][1]);
- EXPECT_EQ(255u, gray_pixels[4][2]);
- EXPECT_EQ(255u, gray_pixels[4][3]);
- EXPECT_EQ(96u, gray_pixels[5][0]);
- EXPECT_EQ(96u, gray_pixels[5][1]);
- EXPECT_EQ(96u, gray_pixels[5][2]);
- EXPECT_EQ(224u, gray_pixels[5][3]);
+ EXPECT_NEAR(30u, gray_pixels[0][0], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][1], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][2], 1);
+ EXPECT_NEAR(128u, gray_pixels[0][3], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][0], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][1], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][2], 1);
+ EXPECT_NEAR(0u, gray_pixels[1][3], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][0], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][1], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[2][3], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][0], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][1], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[3][3], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][0], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][1], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][3], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][0], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][1], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][2], 1);
+ EXPECT_NEAR(224u, gray_pixels[5][3], 1);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
@@ -418,6 +423,20 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
}
+
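+ // Gray of a pixel whose R, G and B channels are already equal should be
+ // the identity transform; verify exact pass-through for all 256 levels.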
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i;
+ orig_pixels[i][2] = i;
+ orig_pixels[i][3] = i;
+ }
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i, orig_pixels[i][0]);
+ EXPECT_EQ(i, orig_pixels[i][1]);
+ EXPECT_EQ(i, orig_pixels[i][2]);
+ EXPECT_EQ(i, orig_pixels[i][3]);
+ }
}
TEST_F(LibYUVPlanarTest, TestARGBSepia) {
@@ -763,27 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
}
}
-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
- SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
- for (int i = 0; i < 1280; ++i) {
- orig_pixels[i][0] = i;
- orig_pixels[i][1] = i / 2;
- orig_pixels[i][2] = i / 3;
- orig_pixels[i][3] = i / 4;
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ MaskCpuFlags(disable_cpu_flags_);
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
- EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
- EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
- EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
}
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
}
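+
+// The three *_Opt mirror tests above follow this file's standard C-vs-opt
+// harness. A minimal sketch of the pattern (names are illustrative, not
+// part of this patch):
+//
+//   MaskCpuFlags(disable_cpu_flags_);    // mask SIMD; force the C path
+//   Mirror(src, stride, dst_c, stride, w, h);       // reference output
+//   MaskCpuFlags(benchmark_cpu_info_);   // restore benchmarked CPU flags
+//   for (int i = 0; i < benchmark_iterations_; ++i)
+//     Mirror(src, stride, dst_opt, stride, w, h);   // optimized output
+//   for (int i = 0; i < n; ++i)
+//     EXPECT_EQ(dst_c[i], dst_opt[i]);   // mirroring must be bit-exact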
TEST_F(LibYUVPlanarTest, TestShade) {
@@ -1058,7 +1125,8 @@ static int TestBlend(int width,
int disable_cpu_flags,
int benchmark_cpu_info,
int invert,
- int off) {
+ int off,
+ int attenuate) {
if (width < 1) {
width = 1;
}
@@ -1072,10 +1140,12 @@ static int TestBlend(int width,
src_argb_a[i + off] = (fastrand() & 0xff);
src_argb_b[i + off] = (fastrand() & 0xff);
}
- ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
- height);
- ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
- height);
+ MemRandomize(src_argb_a, kStride * height + off);
+ MemRandomize(src_argb_b, kStride * height + off);
+ if (attenuate) {
+ ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
+ height);
+ }
memset(dst_argb_c, 255, kStride * height);
memset(dst_argb_opt, 255, kStride * height);
@@ -1105,28 +1175,35 @@ static int TestBlend(int width,
TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
int max_diff =
TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) {
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
@@ -2321,7 +2398,8 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) {
}
TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
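+ // e.g. 99x33 = 3267 pixels rounds up to 3280 (205 * 16), so opt paths
+ // that may process 16 pixels per step are compared on whole buffers.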
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(dst_pixels_opt, kPixels);
align_buffer_page_end(dst_pixels_c, kPixels);
@@ -2349,7 +2427,8 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
}
TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(orig_pixels, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
@@ -2482,7 +2561,8 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(tmp_pixels_u, kPixels);
align_buffer_page_end(tmp_pixels_v, kPixels);
@@ -2526,7 +2606,8 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(tmp_pixels_u, kPixels);
align_buffer_page_end(tmp_pixels_v, kPixels);
@@ -2568,8 +2649,39 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
+TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+ align_buffer_page_end(src_pixels, kPixels * 2);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(dst_pixels_opt, kPixels * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@@ -2617,7 +2729,8 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@@ -2666,7 +2779,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
@@ -2710,7 +2824,8 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
@@ -2746,7 +2861,8 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
#endif // HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
@@ -2776,6 +2892,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT16TO8ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
@@ -2821,9 +2938,11 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif // HAS_CONVERT16TO8ROW_AVX2
+#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 16
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
@@ -2855,6 +2974,7 @@ TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT8TO16ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
@@ -3173,32 +3293,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
- SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint16_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 + 4; ++i) {
+ for (int i = 0; i < 1280 + 8; ++i) {
orig_pixels[i] = i * 256;
}
- GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -3224,47 +3345,285 @@ extern "C" void GaussCol_C(const uint16_t* src0,
int width);
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
- SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]);
- SIMD_ALIGNED(uint32_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint32_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
+ SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 * 5; ++i) {
- orig_pixels[i] = i;
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<uint16_t>(i);
}
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
- 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
+ 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+ } else {
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+ }
+#else
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+}
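+
+// GaussCol applies the 5-tap binomial kernel 1-4-6-4-1 down a column; the
+// spot check removed below encoded exactly those weights
+// (row0*1 + row1*4 + row2*6 + row3*4 + row4*1).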
+
+TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
+ SIMD_ALIGNED(float orig_pixels[1280 + 4]);
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 1280 + 4; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0],
- 640);
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
+}
- EXPECT_EQ(dst_pixels_c[0],
- static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 +
- 640 * 4 * 1));
- EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
+TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
+ align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows
+ float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
+
+ memset(orig_pixels, 0, 1280 * 5 * 4);
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ } else {
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ }
+#else
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(orig_pixels_buf);
+}
+
+TEST_F(LibYUVPlanarTest, SwapUVRow) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+
+ align_buffer_page_end(src_pixels_vu, kPixels * 2);
+ align_buffer_page_end(dst_pixels_uv, kPixels * 2);
+ MemRandomize(src_pixels_vu, kPixels * 2);
+ memset(dst_pixels_uv, 1, kPixels * 2);
+
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(kPixels, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (int j = 0; j < benchmark_iterations_; j++) {
+ SwapUVRow(src_pixels_vu, dst_pixels_uv, kPixels);
+ }
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
+ EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_vu);
+ free_aligned_buffer_page_end(dst_pixels_uv);
+}
+#endif // ENABLE_ROW_TESTS
+
+TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
+ const int kSize = benchmark_width_ * benchmark_height_ * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ align_buffer_page_end(dst_pixels_opt, kSize);
+ align_buffer_page_end(dst_pixels_c, kSize);
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
+ }
+ memset(dst_pixels_opt, 1, kSize);
+ memset(dst_pixels_c, 2, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ }
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
+ << i;
+ }
+
+ free_aligned_buffer_page_end(dst_pixels_c);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
+ int dst_width = (benchmark_width_ + 1) / 2;
+ int dst_height = (benchmark_height_ + 1) / 2;
+ align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(tmp_pixels_u, dst_width * dst_height);
+ align_buffer_page_end(tmp_pixels_v, dst_width * dst_height);
+ align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_);
+ MemRandomize(tmp_pixels_u, dst_width * dst_height);
+ MemRandomize(tmp_pixels_v, dst_width * dst_height);
+ MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_c, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
+ EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_uv_opt);
+ free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
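+// NV12/NV21 store a full-resolution Y plane plus one interleaved chroma
+// plane at half resolution in each axis, so the chroma buffer holds
+// ((w + 1) / 2) * 2 * ((h + 1) / 2) bytes. The two copy tests below verify
+// that both planes round-trip byte-for-byte.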
+TEST_F(LibYUVPlanarTest, NV12Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_uv, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_uv, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_uv, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_uv, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV12Copy(src_y, benchmark_width_, src_uv, halfwidth * 2, dst_y,
+ benchmark_width_, dst_uv, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_uv[i], dst_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVPlanarTest, NV21Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_vu, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_vu, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_vu, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y,
+ benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_vu[i], dst_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_vu);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
}
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
index d2003895961..3208b66a2ad 100644
--- a/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
+++ b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
@@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
benchmark_cpu_info_);
}
+TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
+ int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
+
+ align_buffer_page_end(src_argb, argb_plane_size);
+ align_buffer_page_end(dst_argb, argb_plane_size);
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate270));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate270));
+
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(src_argb);
+}
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/rotate_test.cc b/chromium/third_party/libyuv/unit_test/rotate_test.cc
index d04b96e9c68..61941e63e0e 100644
--- a/chromium/third_party/libyuv/unit_test/rotate_test.cc
+++ b/chromium/third_party/libyuv/unit_test/rotate_test.cc
@@ -135,6 +135,123 @@ TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
benchmark_cpu_info_);
}
+static void I444TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i444_y_size = src_width * Abs(src_height);
+ int src_i444_uv_size = src_width * Abs(src_height);
+ int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
+ align_buffer_page_end(src_i444, src_i444_size);
+ for (int i = 0; i < src_i444_size; ++i) {
+ src_i444[i] = fastrand() & 0xff;
+ }
+
+ int dst_i444_y_size = dst_width * dst_height;
+ int dst_i444_uv_size = dst_width * dst_height;
+ int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
+ align_buffer_page_end(dst_i444_c, dst_i444_size);
+ align_buffer_page_end(dst_i444_opt, dst_i444_size);
+ memset(dst_i444_c, 2, dst_i444_size);
+ memset(dst_i444_opt, 3, dst_i444_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
+ dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
+ dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
+ dst_width, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i444_size; ++i) {
+ EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i444_c);
+ free_aligned_buffer_page_end(dst_i444_opt);
+ free_aligned_buffer_page_end(src_i444);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but are disabled because they use C code; they can
+// be exercised by passing an odd width on the command line or via an
+// environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
static void NV12TestRotate(int src_width,
int src_height,
int dst_width,
diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
index 6a0a58640e4..2fdf5f60341 100644
--- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
@@ -259,7 +259,7 @@ static int ARGBClipTestFilter(int src_width,
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
-TEST_FACTOR(8, 1, 8)
+// TEST_FACTOR(8, 1, 8)  Disabled: too slow for routine benchmark runs.
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
@@ -303,10 +303,12 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
-TEST_SCALETO(ARGBScale, 352, 288)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
+#ifdef ENABLE_SLOW_TESTS
TEST_SCALETO(ARGBScale, 1280, 720)
+TEST_SCALETO(ARGBScale, 1920, 1080)
+#endif // ENABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
@@ -454,4 +456,79 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
EXPECT_LE(diff, 10);
}
+TEST_F(LibYUVScaleTest, ARGBTest3x) {
+ const int kSrcStride = 48 * 4;
+ const int kDstStride = 16 * 4;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 48 * 3; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(49, dest_pixels[0]);
+ EXPECT_EQ(255 - 49, dest_pixels[1]);
+ EXPECT_EQ(50, dest_pixels[2]);
+ EXPECT_EQ(59, dest_pixels[3]);
+
+ ARGBScale(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(49, dest_pixels[0]);
+ EXPECT_EQ(255 - 49, dest_pixels[1]);
+ EXPECT_EQ(50, dest_pixels[2]);
+ EXPECT_EQ(59, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, ARGBTest4x) {
+ const int kSrcStride = 64 * 4;
+ const int kDstStride = 16 * 4;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 64 * 4; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_NEAR((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0], 4);
+ EXPECT_NEAR((255 - 65 + 255 - 66 + 255 - 129 + 255 - 130 + 2) / 4,
+ dest_pixels[1], 4);
+ EXPECT_NEAR((1 * 4 + 65 + 66 + 129 + 130 + 2) / 4, dest_pixels[2], 4);
+ EXPECT_NEAR((10 * 4 + 65 + 66 + 129 + 130 + 2) / 4, dest_pixels[3], 4);
+
+ ARGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(130, dest_pixels[0]);
+ EXPECT_EQ(255 - 130, dest_pixels[1]);
+ EXPECT_EQ(130 + 1, dest_pixels[2]);
+ EXPECT_EQ(130 + 10, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
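+
+// In ARGBTest3x/4x above, kFilterNone point-samples a single source pixel
+// per destination pixel: the 64x4 -> 16x1 case reads source pixel (2, 2),
+// whose channel-0 value is 2 * 64 + 2 = 130, matching the exact checks.
+// kFilterBilinear averages a neighborhood, hence the EXPECT_NEAR
+// tolerances on the filtered results.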
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc
index 08b6cffaa26..d627af02d63 100644
--- a/chromium/third_party/libyuv/unit_test/scale_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_test.cc
@@ -14,7 +14,10 @@
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
+#endif
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
@@ -22,14 +25,14 @@
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static int TestFilter(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
+static int I420TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -141,14 +144,14 @@ static int TestFilter(int src_width,
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
-static int TestFilter_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
+static int I420TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -256,41 +259,412 @@ static int TestFilter_16(int src_width,
return max_diff;
}
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int I444TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ if (!src_y || !src_u || !src_v) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt ||
+ !dst_v_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_c, dst_stride_y, dst_u_c,
+ dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt,
+ dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height,
+ f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT.
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // The C version may differ slightly from the optimized one, since the
+ // order of operations can introduce rounding differences. Diff the
+ // buffers and verify that the maximum difference does not exceed 3.
+ int max_diff = 0;
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv); ++j) {
+ int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
+ dst_u_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] -
+ dst_v_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_u_c);
+ free_aligned_buffer_page_end(dst_v_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+
+ return max_diff;
+}
+
+// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
+// 0 = exact.
+static int I444TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_16, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_16, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
+ uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16);
+ uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_16[i] = src_y[i];
+ }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_16[i] = src_u[i];
+ p_src_v_16[i] = src_v[i];
+ }
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2);
+
+ uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
+ uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16);
+ uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv,
+ p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16,
+ dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_16);
+ free_aligned_buffer_page_end(dst_u_16);
+ free_aligned_buffer_page_end(dst_v_16);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_16);
+ free_aligned_buffer_page_end(src_u_16);
+ free_aligned_buffer_page_end(src_v_16);
+
+ return max_diff;
+}
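+
+// The _16 filter tests feed the same 8-bit random data through the 8-bit
+// and 16-bit scalers; with inputs confined to [0, 255] the two fixed-point
+// paths are expected to agree, so any max_diff reported here reflects real
+// implementation divergence rather than input range.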
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int NV12TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv * 2;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_uv, src_uv_plane_size);
+ if (!src_y || !src_uv) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_uv, src_uv_plane_size);
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv * 2;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_uv_c || !dst_y_opt || !dst_uv_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_c, dst_stride_y, dst_uv_c, dst_stride_uv, dst_width,
+ dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_opt, dst_stride_y, dst_uv_opt, dst_stride_uv, dst_width,
+ dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT.
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // The C version may differ slightly from the optimized one, since the
+ // order of operations can introduce rounding differences. Diff the
+ // buffers and verify that the maximum difference does not exceed 3.
+ int max_diff = 0;
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv * 2); ++j) {
+ int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] -
+ dst_uv_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_uv_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_uv_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+
+ return max_diff;
+}
+
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
// The factor of 2 accounts for chroma subsampling.
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
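+// Worked example (illustrative): for a 1280-wide source and factor 3/4
+// (nom = 3, denom = 4), SX(1280, 3, 4) = ((1280 / 3 + 1) / 2) * 4 * 2 = 1704
+// and DX(1280, 3, 4) = ((1280 / 3 + 1) / 2) * 3 * 2 = 1278; 1278 / 1704
+// reduces exactly to 3 / 4, and both dimensions stay even for the
+// subsampled chroma planes.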
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
- int diff = TestFilter( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter##_16) { \
- int diff = TestFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
+#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \
+ int diff = I420TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \
+ int diff = I444TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_16) { \
+ int diff = I420TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_16) { \
+ int diff = I444TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) { \
+ int diff = NV12TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact;
// filtered results may differ because SSSE3, NEON and C use different
// fixed-point implementations.
-#define TEST_FACTOR(name, nom, denom, boxdiff) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+#ifdef ENABLE_SLOW_TESTS
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(, name, None, nom, denom, 0) \
+ TEST_FACTOR1(, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Box, nom, denom, boxdiff)
+#else
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
+ TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
+#endif
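+// Prepending gtest's DISABLED_ prefix through the macro's first argument
+// lets one TEST_FACTOR1 body emit either enabled tests (empty argument) or
+// disabled ones; gtest skips DISABLED_* tests unless run with
+// --gtest_also_run_disabled_tests.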
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
-TEST_FACTOR(8, 1, 8, 0)
+// TEST_FACTOR(8, 1, 8, 0)  Disabled: takes ~90 seconds, too slow for
+// routine benchmark runs.
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
@@ -299,50 +673,105 @@ TEST_FACTOR(3, 1, 3, 0)
#undef SX
#undef DX
-#define TEST_SCALETO1(name, width, height, filter, max_diff) \
- TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
- int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
+#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \
+ int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter) { \
+ int diff = I444TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \
+ int diff = I420TestFilter_16( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \
+ int diff = I444TestFilter_16( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
- int diff = TestFilter(width, height, Abs(benchmark_width_), \
- Abs(benchmark_height_), kFilter##filter, \
- benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \
- height, kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \
+ int diff = I420TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(width, height, Abs(benchmark_width_), \
- Abs(benchmark_height_), kFilter##filter, \
- benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, I444##name##From##width##x##height##_##filter) { \
+ int diff = I444TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \
+ int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \
+ int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
+#ifdef ENABLE_SLOW_TESTS
// Test scale to a specified size with all 4 filters.
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 3) \
- TEST_SCALETO1(name, width, height, Bilinear, 3) \
- TEST_SCALETO1(name, width, height, Box, 3)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(, name, width, height, None, 0) \
+ TEST_SCALETO1(, name, width, height, Linear, 3) \
+ TEST_SCALETO1(, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(, name, width, height, Box, 3)
+#else
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
+#endif
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
-TEST_SCALETO(Scale, 352, 288)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
TEST_SCALETO(Scale, 1280, 720)
+#ifdef ENABLE_SLOW_TESTS
+TEST_SCALETO(Scale, 1920, 1080)
+#endif // ENABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
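Worth spelling out the fan-out here: each TEST_SCALETO line expands through four filters, and each TEST_SCALETO1 now defines ten tests, six 8-bit cases (I420/I444/NV12, in both the To and From directions) that always run, plus four 16-bit I420/I444 cases that carry the DISABLED_ prefix unless ENABLE_SLOW_TESTS is set. One size line therefore yields forty gtest cases, with names such as

  LibYUVScaleTest.I420ScaleTo320x240_Bilinear
  LibYUVScaleTest.NV12ScaleFrom640x360_None
  LibYUVScaleTest.DISABLED_I444ScaleTo1280x720_Box_16

(the last runnable via --gtest_also_run_disabled_tests).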
+#ifdef ENABLE_ROW_TESTS
#ifdef HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
@@ -437,6 +866,10 @@ extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+extern "C" void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
@@ -463,6 +896,13 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
} else {
ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
}
+#elif !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ if (has_mmi) {
+ ScaleRowUp2_16_MMI(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ } else {
+ ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ }
#else
ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
#endif
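The new branch follows libyuv's usual two-level dispatch: the MIPS/Loongson architecture is selected at compile time, and MMI availability is confirmed at run time. Condensed into a wrapper for illustration (the wrapper name is invented; the test open-codes the same logic inline):

  void ScaleRowUp2_16_Any(const uint16_t* src, ptrdiff_t stride,
                          uint16_t* dst, int width) {
  #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
    if (TestCpuFlag(kCpuHasMMI)) {  // run-time check for MMI support
      ScaleRowUp2_16_MMI(src, stride, dst, width);
      return;
    }
  #endif
    ScaleRowUp2_16_C(src, stride, dst, width);  // portable fallback
  }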
@@ -513,6 +953,7 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
EXPECT_EQ(dst_pixels_c[1279], 3839);
}
+#endif // ENABLE_ROW_TESTS
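Everything from the matching #ifdef ENABLE_ROW_TESTS above down to this #endif (the SSSE3/NEON ScaleRowDown2Box odd-width checks and the 16-bit row tests) is now compiled out by default; defining ENABLE_ROW_TESTS, e.g. via -DENABLE_ROW_TESTS on the compile line (how it is plumbed through the build is not shown in this patch), restores them.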
// Test scaling plane with 8 bit C vs 16 bit C and return maximum pixel
// difference.
@@ -583,14 +1024,14 @@ static int TestPlaneFilter_16(int src_width,
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \
- int diff = TestPlaneFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \
+ int diff = TestPlaneFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
@@ -603,7 +1044,7 @@ static int TestPlaneFilter_16(int src_width,
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
-TEST_FACTOR(8, 1, 8, 0)
+// TEST_FACTOR(8, 1, 8, 0) disabled for benchmark performance; it takes 90 seconds.
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
@@ -611,4 +1052,171 @@ TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR
#undef SX
#undef DX
+
+TEST_F(LibYUVScaleTest, PlaneTest3x) {
+ const int kSrcStride = 48;
+ const int kDstStride = 16;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 48 * 3; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(49, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(49, dest_pixels[0]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
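For reference, why 49: the plane is initialized with orig_pixels[i] = i on a 48-byte stride, so the sample at row r, column c holds 48 * r + c. With a 3x reduction, output pixel 0 corresponds to the center of the first 3x3 block, row 1, column 1, i.e. 48 * 1 + 1 = 49, and both the bilinear and the point-sampled (kFilterNone) paths land on that sample here.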
+
+TEST_F(LibYUVScaleTest, PlaneTest4x) {
+ const int kSrcStride = 64;
+ const int kDstStride = 16;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 64 * 4; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(130, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
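Same bookkeeping for 4x: samples are 64 * r + c, and the filtered path averages the four center samples of the first 4x4 block with a +2 rounding term, (65 + 66 + 129 + 130 + 2) / 4 = 98, while kFilterNone point-samples row 2, column 2, i.e. 64 * 2 + 2 = 130, matching the inline comment.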
+
+// The intent is to test 200x50 to 50x200, but width and height come from
+// the benchmark parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterNone);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterNone);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
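All three rotate tests use the same cross-check skeleton, shown condensed below (src/ref/opt/w/h are shorthand for the test's orig_pixels, dest_c_pixels, dest_opt_pixels and benchmark dimensions):

  MaskCpuFlags(disable_cpu_flags_);   // force the portable C paths
  ScalePlane(src, w, w, h, ref, h, h, w, kFilterNone);  // C reference
  MaskCpuFlags(benchmark_cpu_info_);  // restore SIMD paths
  ScalePlane(src, w, w, h, opt, h, h, w, kFilterNone);  // optimized result
  for (int i = 0; i < w * h; ++i) {
    EXPECT_EQ(ref[i], opt[i]);        // require a bit-exact match
  }

Note the transposed stride and dimensions on the destination (h, h, w): the output plane has the rotated shape, which is what exercises the 200x50-to-50x200 intent.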
+
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBilinear);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBilinear);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+// The intent is to test 200x50 to 50x200, but width and height come from
+// the benchmark parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBox);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_,
+ benchmark_width_, benchmark_height_,
+ dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kFilterBox);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/unit_test.cc b/chromium/third_party/libyuv/unit_test/unit_test.cc
index 7d662706aaa..2aa9cdaad6e 100644
--- a/chromium/third_party/libyuv/unit_test/unit_test.cc
+++ b/chromium/third_party/libyuv/unit_test/unit_test.cc
@@ -17,6 +17,9 @@
#ifdef LIBYUV_USE_GFLAGS
#include "gflags/gflags.h"
#endif
+#ifdef LIBYUV_USE_BASE_FLAGS
+#include "base/commandlineflags.h"
+#endif
#include "libyuv/cpu_id.h"
unsigned int fastrand_seed = 0xfb;
@@ -66,6 +69,9 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_MSA")) {
cpu_info &= ~libyuv::kCpuHasMSA;
}
+ if (TestEnv("LIBYUV_DISABLE_MMI")) {
+ cpu_info &= ~libyuv::kCpuHasMMI;
+ }
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
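The new TestEnv clause means the MMI kernels can be masked out at run time like the other MIPS SIMD family, which helps attribute a failure to a specific code path; for example (binary name assumed to be libyuv's standard libyuv_unittest target):

  LIBYUV_DISABLE_MMI=1 ./libyuv_unittest --gtest_filter='LibYUVScaleTest.*'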
diff --git a/chromium/third_party/libyuv/unit_test/video_common_test.cc b/chromium/third_party/libyuv/unit_test/video_common_test.cc
index a84206a2adb..eb183aaa796 100644
--- a/chromium/third_party/libyuv/unit_test/video_common_test.cc
+++ b/chromium/third_party/libyuv/unit_test/video_common_test.cc
@@ -65,7 +65,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
- EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c
index 59c65d60e0f..46f9c1bfff4 100644
--- a/chromium/third_party/libyuv/util/cpuid.c
+++ b/chromium/third_party/libyuv/util/cpuid.c
@@ -12,10 +12,11 @@
#include <stdlib.h>
#include <string.h>
-#define INCLUDE_LIBYUV_COMPARE_H_
-#include "libyuv.h"
-#include "./psnr.h"
-#include "./ssim.h"
+#include "libyuv/cpu_id.h"
+
+#ifdef __cplusplus
+using namespace libyuv;
+#endif
int main(int argc, const char* argv[]) {
int cpu_flags = TestCpuFlag(-1);
@@ -71,6 +72,8 @@ int main(int argc, const char* argv[]) {
if (has_mips) {
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MSA %x\n", has_msa);
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ printf("Has MMI %x\n", has_mmi);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -81,7 +84,7 @@ int main(int argc, const char* argv[]) {
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- int has_f16c = TestCpuFlag(kCpuHasF16C);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
int has_gfni = TestCpuFlag(kCpuHasGFNI);
int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
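Two things worth noting in this hunk: the tool now includes only the public libyuv/cpu_id.h (the removed psnr/ssim includes look copy-pasted from the psnr util, and the using-directive keeps the unqualified TestCpuFlag/kCpu* names compiling under C++), and the %x format prints the raw flag bit rather than 0/1, so any nonzero value means the feature is present. A quick field check (tool name and invocation are assumptions):

  ./cpuid | grep 'Has MMI'   # nonzero hex value => MMI kernels are eligible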
diff --git a/chromium/third_party/libyuv/util/i444tonv12_eg.cc b/chromium/third_party/libyuv/util/i444tonv12_eg.cc
new file mode 100644
index 00000000000..0fcb4095a80
--- /dev/null
+++ b/chromium/third_party/libyuv/util/i444tonv12_eg.cc
@@ -0,0 +1,28 @@
+
+#include "libyuv/convert.h"
+
+#include <stdio.h> // for printf
+#include <string.h> // for memset
+
+int main(int, char**) {
+ unsigned char src_i444[640 * 400 * 3];
+ unsigned char dst_nv12[640 * 400 * 3 / 2];
+
+ for (size_t i = 0; i < sizeof(src_i444); ++i) {
+ src_i444[i] = i & 255;
+ }
+ memset(dst_nv12, 0, sizeof(dst_nv12));
+ libyuv::I444ToNV12(&src_i444[0], 640, // source Y
+ &src_i444[640 * 400], 640, // source U
+ &src_i444[640 * 400 * 2], 640, // source V
+ &dst_nv12[0], 640, // dest Y
+ &dst_nv12[640 * 400], 640, // dest UV
+ 640, 400); // width and height
+
+ int checksum = 0;
+ for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
+ checksum += dst_nv12[i];
+ }
+ printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
+ return 0;
+}
\ No newline at end of file
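The new example fills a 640x400 I444 ramp, converts it with libyuv::I444ToNV12(), and byte-sums the NV12 output against the golden value 0x2ec0c00, so it doubles as a standalone smoke test for the conversion. One plausible way to build it against a checked-out libyuv (include path, library location and name are assumptions, not part of the patch):

  g++ -Iinclude util/i444tonv12_eg.cc -Lout -lyuv -o i444tonv12_eg && ./i444tonv12_eg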
diff --git a/chromium/third_party/libyuv/util/psnr.cc b/chromium/third_party/libyuv/util/psnr.cc
index f54015bab82..c7bee7f97d2 100644
--- a/chromium/third_party/libyuv/util/psnr.cc
+++ b/chromium/third_party/libyuv/util/psnr.cc
@@ -189,7 +189,7 @@ static uint32_t SumSquareError_SSE2(const uint8_t* src_a,
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
- ); // NOLINT
+ ); // NOLINT
return sse;
}
#endif // LIBYUV_DISABLE_X86 etc
diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk
index c4307a431f9..b0a344ae06d 100644
--- a/chromium/third_party/libyuv/winarm.mk
+++ b/chromium/third_party/libyuv/winarm.mk
@@ -31,6 +31,7 @@ LOCAL_OBJ_FILES = \
source/scale_any.o\
source/scale_argb.o\
source/scale_common.o\
+ source/scale_uv.o\
source/video_common.o
.cc.o: