diff options
author | Jim Bankoski <jimbankoski@google.com> | 2016-06-28 07:26:07 -0700 |
---|---|---|
committer | James Bankoski <jimbankoski@google.com> | 2016-06-29 17:53:14 +0000 |
commit | b8f83282f8506ad3d839440341bbe683df7d2cf6 (patch) | |
tree | fa0097c5a4fd8b53fa39d11c20da05c656eaa535 /third_party/libyuv | |
parent | b34705f64ff0bc8facd0fc33fe07bf6def67cb45 (diff) | |
download | libvpx-b8f83282f8506ad3d839440341bbe683df7d2cf6.tar.gz |
libyuv: update to b8ddb5a2
Fixes color issue when scaling without breaking mingw.
BUG=https://bugs.chromium.org/p/libyuv/issues/detail?id=605
BUG=https://bugs.chromium.org/p/webm/issues/detail?id=1252
Change-Id: I3920c5664def7ae7a23f60fb160d26d23bc86a27
Diffstat (limited to 'third_party/libyuv')
55 files changed, 6814 insertions, 8027 deletions
diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx index 09693c1f2..b52a68371 100644 --- a/third_party/libyuv/README.libvpx +++ b/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1456 +Version: b8ddb5a2 License: BSD License File: LICENSE @@ -13,3 +13,13 @@ which down-samples the original input video (f.g. 1280x720) a number of times in order to encode multiple resolution bit streams. Local Modifications: + +Remove files unnecessary to libvpx build. + +rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \ + LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \ + all.gyp build_overrides/ chromium/ codereview.settings docs/ \ + download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \ + include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \ + libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \ + third_party/ tools/ unit_test/ util/ winarm.mk diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h index a8d3fa07a..a2cdc5718 100644 --- a/third_party/libyuv/include/libyuv/convert.h +++ b/third_party/libyuv/include/libyuv/convert.h @@ -12,10 +12,8 @@ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" -// TODO(fbarchard): Remove the following headers includes. -#include "libyuv/convert_from.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" + +#include "libyuv/rotate.h" // For enum RotationMode. #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h index 360c6d359..079d273b1 100644 --- a/third_party/libyuv/include/libyuv/convert_argb.h +++ b/third_party/libyuv/include/libyuv/convert_argb.h @@ -12,10 +12,8 @@ #define INCLUDE_LIBYUV_CONVERT_ARGB_H_ #include "libyuv/basic_types.h" -// TODO(fbarchard): Remove the following headers includes -#include "libyuv/convert_from.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" + +#include "libyuv/rotate.h" // For enum RotationMode. // TODO(fbarchard): This set of functions should exactly match convert.h // TODO(fbarchard): Add tests. Create random content of right size and convert @@ -60,6 +58,22 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Convert J444 to ARGB. +LIBYUV_API +int J444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + // Convert I411 to ARGB. LIBYUV_API int I411ToARGB(const uint8* src_y, int src_stride_y, @@ -68,6 +82,24 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Convert I420 with Alpha to preattenuated ARGB. +LIBYUV_API +int I420AlphaToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + const uint8* src_a, int src_stride_a, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int attenuate); + +// Convert I420 with Alpha to preattenuated ABGR. +LIBYUV_API +int I420AlphaToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + const uint8* src_a, int src_stride_a, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height, int attenuate); + // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API int I400ToARGB(const uint8* src_y, int src_stride_y, @@ -131,6 +163,54 @@ int J422ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Convert J420 to ABGR. +LIBYUV_API +int J420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert J422 to ABGR. +LIBYUV_API +int J422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert H420 to ARGB. +LIBYUV_API +int H420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert H422 to ARGB. +LIBYUV_API +int H422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert H420 to ABGR. +LIBYUV_API +int H420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert H422 to ABGR. +LIBYUV_API +int H422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + // BGRA little endian (argb in memory) to ARGB. LIBYUV_API int BGRAToARGB(const uint8* src_frame, int src_stride_frame, diff --git a/third_party/libyuv/include/libyuv/convert_from.h b/third_party/libyuv/include/libyuv/convert_from.h index 9fd8d4de5..39e1578a0 100644 --- a/third_party/libyuv/include/libyuv/convert_from.h +++ b/third_party/libyuv/include/libyuv/convert_from.h @@ -56,8 +56,6 @@ int I400Copy(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); -// TODO(fbarchard): I420ToM420 - LIBYUV_API int I420ToNV12(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h index dc858a814..dfb7445e2 100644 --- a/third_party/libyuv/include/libyuv/cpu_id.h +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -18,9 +18,8 @@ namespace libyuv { extern "C" { #endif -// TODO(fbarchard): Consider overlapping bits for different architectures. // Internal flag to indicate cpuid requires initialization. -#define kCpuInit 0x1 +static const int kCpuInitialized = 0x1; // These flags are only valid on ARM processors. static const int kCpuHasARM = 0x2; @@ -37,12 +36,12 @@ static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; +static const int kCpuHasAVX3 = 0x2000; // 0x2000, 0x4000, 0x8000 reserved for future X86 flags. // These flags are only valid on MIPS processors. static const int kCpuHasMIPS = 0x10000; -static const int kCpuHasMIPS_DSP = 0x20000; -static const int kCpuHasMIPS_DSPR2 = 0x40000; +static const int kCpuHasDSPR2 = 0x20000; // Internal function used to auto-init. LIBYUV_API @@ -57,13 +56,13 @@ int ArmCpuCaps(const char* cpuinfo_name); // returns non-zero if instruction set is detected static __inline int TestCpuFlag(int test_flag) { LIBYUV_API extern int cpu_info_; - return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag; + return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag; } // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. // MaskCpuFlags(-1) to enable all cpu specific optimizations. -// MaskCpuFlags(0) to disable all cpu specific optimizations. +// MaskCpuFlags(1) to disable all cpu specific optimizations. LIBYUV_API void MaskCpuFlags(int enable_flags); diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h index ae994db89..881b0c5c6 100644 --- a/third_party/libyuv/include/libyuv/planar_functions.h +++ b/third_party/libyuv/include/libyuv/planar_functions.h @@ -145,13 +145,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height); -// Convert NV21 to RGB565. -LIBYUV_API -int NV21ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); - // I422ToARGB is in convert_argb.h // Convert I422 to BGRA. LIBYUV_API @@ -177,6 +170,14 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, uint8* dst_rgba, int dst_stride_rgba, int width, int height); +// Alias +#define RGB24ToRAW RAWToRGB24 + +LIBYUV_API +int RAWToRGB24(const uint8* src_raw, int src_stride_raw, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + // Draw a rectangle into I420. LIBYUV_API int I420Rect(uint8* dst_y, int dst_stride_y, @@ -281,13 +282,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Copy ARGB to ARGB. +// Copy Alpha channel of ARGB to alpha of ARGB. LIBYUV_API int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Copy ARGB to ARGB. +// Extract the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_a, int dst_stride_a, + int width, int height); + +// Copy Y channel to Alpha of ARGB. LIBYUV_API int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, @@ -301,6 +308,7 @@ LIBYUV_API ARGBBlendRow GetARGBBlend(); // Alpha Blend ARGB images and store to destination. +// Source is pre-multiplied by alpha using ARGBAttenuate. // Alpha of destination is set to 255. LIBYUV_API int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, @@ -308,6 +316,31 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Alpha Blend plane and store to destination. +// Source is not pre-multiplied by alpha. +LIBYUV_API +int BlendPlane(const uint8* src_y0, int src_stride_y0, + const uint8* src_y1, int src_stride_y1, + const uint8* alpha, int alpha_stride, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alpha Blend YUV images and store to destination. +// Source is not pre-multiplied by alpha. +// Alpha is full width x height and subsampled to half size to apply to UV. +LIBYUV_API +int I420Blend(const uint8* src_y0, int src_stride_y0, + const uint8* src_u0, int src_stride_u0, + const uint8* src_v0, int src_stride_v0, + const uint8* src_y1, int src_stride_y1, + const uint8* src_u1, int src_stride_u1, + const uint8* src_v1, int src_stride_v1, + const uint8* alpha, int alpha_stride, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. LIBYUV_API int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, @@ -357,12 +390,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Convert MJPG to ARGB. -LIBYUV_API -int MJPGToARGB(const uint8* sample, size_t sample_size, - uint8* argb, int argb_stride, - int w, int h, int dw, int dh); - // Internal function - do not call directly. // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. @@ -389,22 +416,49 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height, uint32 value); -// Interpolate between two ARGB images using specified amount of interpolation +// Interpolate between two images using specified amount of interpolation // (0 to 255) and store to destination. -// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 -// and 255 means 1% src_argb0 and 99% src_argb1. -// Internally uses ARGBScale bilinear filtering. -// Caveat: This function will write up to 16 bytes beyond the end of dst_argb. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 +// and 255 means 1% src0 and 99% src1. +LIBYUV_API +int InterpolatePlane(const uint8* src0, int src_stride0, + const uint8* src1, int src_stride1, + uint8* dst, int dst_stride, + int width, int height, int interpolation); + +// Interpolate between two ARGB images using specified amount of interpolation +// Internally calls InterpolatePlane with width * 4 (bpp). LIBYUV_API int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, uint8* dst_argb, int dst_stride_argb, int width, int height, int interpolation); +// Interpolate between two YUV images using specified amount of interpolation +// Internally calls InterpolatePlane on each plane where the U and V planes +// are half width and half height. +LIBYUV_API +int I420Interpolate(const uint8* src0_y, int src0_stride_y, + const uint8* src0_u, int src0_stride_u, + const uint8* src0_v, int src0_stride_v, + const uint8* src1_y, int src1_stride_y, + const uint8* src1_u, int src1_stride_u, + const uint8* src1_v, int src1_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, int interpolation); + #if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__i386__) && !defined(__SSE2__)) #define LIBYUV_DISABLE_X86 #endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) diff --git a/third_party/libyuv/include/libyuv/rotate_row.h b/third_party/libyuv/include/libyuv/rotate_row.h index c41cf3273..ebc487f9a 100644 --- a/third_party/libyuv/include/libyuv/rotate_row.h +++ b/third_party/libyuv/include/libyuv/rotate_row.h @@ -22,53 +22,24 @@ extern "C" { (defined(__i386__) && !defined(__SSE2__)) #define LIBYUV_DISABLE_X86 #endif - -// Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// TODO(fbarchard): switch to standard form of inline; fails on clangcl. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#if defined(__APPLE__) && defined(__i386__) -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".private_extern _" #name " \n" \ - ".align 4,0x90 \n" \ -"_" #name ": \n" -#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".align 4,0x90 \n" \ -"_" #name ": \n" -#else -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".align 4,0x90 \n" \ -#name ": \n" +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 #endif #endif - -// The following are available for Visual C: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ - defined(_MSC_VER) && !defined(__clang__) +// The following are available for Visual C and clangcl 32 bit: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif -// The following are available for GCC but not NaCL: +// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) #define HAS_TRANSPOSEWX8_SSSE3 #endif -// The following are available for 32 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif - // The following are available for 64 bit GCC but not NaCL: #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ defined(__x86_64__) @@ -85,8 +56,8 @@ extern "C" { #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ defined(__mips__) && \ defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_TRANSPOSEWX8_MIPS_DSPR2 -#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2 +#define HAS_TRANSPOSEWX8_DSPR2 +#define HAS_TRANSPOSEUVWX8_DSPR2 #endif // defined(__mips__) void TransposeWxH_C(const uint8* src, int src_stride, @@ -100,7 +71,9 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); -void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, +void TransposeWx8_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); void TransposeWx8_Any_NEON(const uint8* src, int src_stride, @@ -109,8 +82,8 @@ void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); void TransposeUVWxH_C(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, @@ -126,9 +99,19 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); + +void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index ebae3e719..b9ea5a842 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -41,6 +41,12 @@ extern "C" { (defined(__i386__) && !defined(__SSE2__)) #define LIBYUV_DISABLE_X86 #endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif // True if compiling for SSSE3 as a requirement. #if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) #define LIBYUV_SSSE3_ONLY @@ -56,6 +62,26 @@ extern "C" { #endif // clang >= 3.5 #endif // __clang__ +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) @@ -71,25 +97,23 @@ extern "C" { #define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565ROW_SSE2 -#define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 +#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 +#define HAS_H422TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 -#define HAS_I411TOARGBROW_SSSE3 -#define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 -#define HAS_I422TOBGRAROW_SSSE3 -#define HAS_I422TORAWROW_SSSE3 #define HAS_I422TORGB24ROW_SSSE3 #define HAS_I422TORGB565ROW_SSSE3 #define HAS_I422TORGBAROW_SSSE3 @@ -99,15 +123,13 @@ extern "C" { #define HAS_J400TOARGBROW_SSE2 #define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 -#define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORROW_UV_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 -#define HAS_NV21TORGB565ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RAWTORGB24ROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 @@ -145,9 +167,9 @@ extern "C" { #define HAS_ARGBSHADEROW_SSE2 #define HAS_ARGBSUBTRACTROW_SSE2 #define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_BLENDPLANEROW_SSSE3 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSE2 #define HAS_INTERPOLATEROW_SSSE3 #define HAS_RGBCOLORTABLEROW_X86 #define HAS_SOBELROW_SSE2 @@ -155,54 +177,18 @@ extern "C" { #define HAS_SOBELXROW_SSE2 #define HAS_SOBELXYROW_SSE2 #define HAS_SOBELYROW_SSE2 -#endif -// The following are available on x64 Visual C and clangcl. -#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ - (!defined(__clang__) || defined(__SSSE3__)) -#define HAS_I422TOARGBROW_SSSE3 +// The following functions fail on gcc/clang 32 bit with fpic and framepointer. +// caveat: clangcl uses row_win.cc which works. +#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \ + !defined(__i386__) || defined(_MSC_VER) +// TODO(fbarchard): fix build error on x86 debug +// https://code.google.com/p/libyuv/issues/detail?id=524 +#define HAS_I411TOARGBROW_SSSE3 +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_SSSE3 #endif - -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// clang >= 3.4.0 required for AVX2. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// The following are available require VS2012. Port to GCC. -#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) -#define HAS_ARGB1555TOARGBROW_AVX2 -#define HAS_ARGB4444TOARGBROW_AVX2 -#define HAS_ARGBTOARGB1555ROW_AVX2 -#define HAS_ARGBTOARGB4444ROW_AVX2 -#define HAS_ARGBTORGB565DITHERROW_AVX2 -#define HAS_ARGBTORGB565DITHERROW_SSE2 -#define HAS_ARGBTORGB565ROW_AVX2 -#define HAS_I411TOARGBROW_AVX2 -#define HAS_I422TOARGB1555ROW_AVX2 -#define HAS_I422TOARGB4444ROW_AVX2 -#define HAS_I422TORGB565ROW_AVX2 -#define HAS_I444TOARGBROW_AVX2 -#define HAS_J400TOARGBROW_AVX2 -#define HAS_NV12TOARGBROW_AVX2 -#define HAS_NV12TORGB565ROW_AVX2 -#define HAS_NV21TOARGBROW_AVX2 -#define HAS_NV21TORGB565ROW_AVX2 -#define HAS_RGB565TOARGBROW_AVX2 #endif // The following are available on all x86 platforms, but @@ -215,21 +201,34 @@ extern "C" { #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_AVX2 +#define HAS_ARGBTOUVJROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX +#define HAS_H422TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 -#define HAS_I422TOABGRROW_AVX2 +#if !(defined(_DEBUG) && defined(__i386__)) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_AVX2 +#endif +#define HAS_I411TOARGBROW_AVX2 +#define HAS_I422TOARGB1555ROW_AVX2 +#define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 -#define HAS_I422TOBGRAROW_AVX2 -#define HAS_I422TORAWROW_AVX2 #define HAS_I422TORGB24ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TORGBAROW_AVX2 +#define HAS_I444TOARGBROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -246,15 +245,27 @@ extern "C" { #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 +#define HAS_BLENDPLANEROW_AVX2 #endif -// The following are disabled when SSSE3 is available: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ - !defined(LIBYUV_SSSE3_ONLY) -#define HAS_ARGBATTENUATEROW_SSE2 -#define HAS_ARGBBLENDROW_SSE2 -#define HAS_MIRRORROW_SSE2 +// The following are available for AVX2 Visual C and clangcl 32 bit: +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 +#define HAS_ARGBTOARGB1555ROW_AVX2 +#define HAS_ARGBTOARGB4444ROW_AVX2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_J400TOARGBROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 +#endif + +// The following are also available on x64 Visual C. +#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ + (!defined(__clang__) || defined(__SSSE3__)) +#define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 #endif // The following are available on Neon platforms: @@ -268,43 +279,44 @@ extern "C" { #define HAS_ARGB4444TOARGBROW_NEON #define HAS_ARGB4444TOUVROW_NEON #define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBSETROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON #define HAS_ARGBTOUV411ROW_NEON -#define HAS_ARGBTOUV422ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON +#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON -#define HAS_J400TOARGBROW_NEON +#define HAS_I400TOARGBROW_NEON #define HAS_I411TOARGBROW_NEON -#define HAS_I422TOABGRROW_NEON +#define HAS_I422ALPHATOARGBROW_NEON #define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB4444ROW_NEON #define HAS_I422TOARGBROW_NEON -#define HAS_I422TOBGRAROW_NEON -#define HAS_I422TORAWROW_NEON #define HAS_I422TORGB24ROW_NEON #define HAS_I422TORGB565ROW_NEON #define HAS_I422TORGBAROW_NEON #define HAS_I422TOUYVYROW_NEON #define HAS_I422TOYUY2ROW_NEON #define HAS_I444TOARGBROW_NEON +#define HAS_J400TOARGBROW_NEON #define HAS_MERGEUVROW_NEON #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON #define HAS_NV12TOARGBROW_NEON #define HAS_NV12TORGB565ROW_NEON #define HAS_NV21TOARGBROW_NEON -#define HAS_NV21TORGB565ROW_NEON #define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTORGB24ROW_NEON #define HAS_RAWTOUVROW_NEON #define HAS_RAWTOYROW_NEON #define HAS_RGB24TOARGBROW_NEON @@ -316,29 +328,28 @@ extern "C" { #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON -#define HAS_ARGBSETROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON -#define HAS_I400TOARGBROW_NEON #define HAS_YUY2TOARGBROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON -#define HAS_ARGBTORGB565DITHERROW_NEON // Effects: #define HAS_ARGBADDROW_NEON #define HAS_ARGBATTENUATEROW_NEON #define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON #define HAS_INTERPOLATEROW_NEON #define HAS_SOBELROW_NEON @@ -346,8 +357,6 @@ extern "C" { #define HAS_SOBELXROW_NEON #define HAS_SOBELXYROW_NEON #define HAS_SOBELYROW_NEON -#define HAS_ARGBCOLORMATRIXROW_NEON -#define HAS_ARGBSHUFFLEROW_NEON #endif // The following are available on Mips platforms: @@ -355,17 +364,15 @@ extern "C" { (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_I422TOABGRROW_MIPS_DSPR2 -#define HAS_I422TOARGBROW_MIPS_DSPR2 -#define HAS_I422TOBGRAROW_MIPS_DSPR2 -#define HAS_INTERPOLATEROW_MIPS_DSPR2 -#define HAS_MIRRORROW_MIPS_DSPR2 -#define HAS_MIRRORUVROW_MIPS_DSPR2 -#define HAS_SPLITUVROW_MIPS_DSPR2 +#define HAS_I422TOARGBROW_DSPR2 +#define HAS_INTERPOLATEROW_DSPR2 +#define HAS_MIRRORROW_DSPR2 +#define HAS_MIRRORUVROW_DSPR2 +#define HAS_SPLITUVROW_DSPR2 #endif #endif -#if defined(_MSC_VER) && !defined(__CLR_VER) +#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #define SIMD_ALIGNED(var) __declspec(align(16)) var #define SIMD_ALIGNED32(var) __declspec(align(64)) var typedef __declspec(align(16)) int16 vec16[8]; @@ -380,7 +387,7 @@ typedef __declspec(align(32)) int8 lvec8[32]; typedef __declspec(align(32)) uint16 ulvec16[16]; typedef __declspec(align(32)) uint32 ulvec32[8]; typedef __declspec(align(32)) uint8 ulvec8[32]; -#elif defined(__GNUC__) +#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #define SIMD_ALIGNED32(var) var __attribute__((aligned(64))) @@ -413,6 +420,56 @@ typedef uint32 ulvec32[8]; typedef uint8 ulvec8[32]; #endif +#if defined(__aarch64__) +// This struct is for Arm64 color conversion. +struct YuvConstants { + uvec16 kUVToRB; + uvec16 kUVToRB2; + uvec16 kUVToG; + uvec16 kUVToG2; + vec16 kUVBiasBGR; + vec32 kYToRgb; +}; +#elif defined(__arm__) +// This struct is for ArmV7 color conversion. +struct YuvConstants { + uvec8 kUVToRB; + uvec8 kUVToG; + vec16 kUVBiasBGR; + vec32 kYToRgb; +}; +#else +// This struct is for Intel color conversion. +struct YuvConstants { + lvec8 kUVToB; + lvec8 kUVToG; + lvec8 kUVToR; + lvec16 kUVBiasB; + lvec16 kUVBiasG; + lvec16 kUVBiasR; + lvec16 kYToRgb; +}; + +// Offsets into YuvConstants structure +#define KUVTOB 0 +#define KUVTOG 32 +#define KUVTOR 64 +#define KUVBIASB 96 +#define KUVBIASG 128 +#define KUVBIASR 160 +#define KYTORGB 192 +#endif + +// Conversion matrix for YUV to RGB +extern const struct YuvConstants kYuvI601Constants; // BT.601 +extern const struct YuvConstants kYuvJPEGConstants; // JPeg color space +extern const struct YuvConstants kYuvH709Constants; // BT.709 + +// Conversion matrix for YVU to BGR +extern const struct YuvConstants kYvuI601Constants; // BT.601 +extern const struct YuvConstants kYvuJPEGConstants; // JPeg color space +extern const struct YuvConstants kYvuH709Constants; // BT.709 + #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP #else @@ -502,159 +559,166 @@ void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_NEON(const uint8* src_y, +void I422AlphaToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void I422ToBGRARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_bgra, - int width); -void I422ToABGRRow_NEON(const uint8* src_y, +void I411ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_abgr, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); -void I422ToRAWRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_raw, - int width); void I422ToRGB565Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width); -void NV21ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_rgb565, - int width); +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); -void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); -void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); +void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); +void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width); +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width); void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix); -void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix); + int width); void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix); + int width); void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int pix); -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix); -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix); -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix); -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix); -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); -void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix); -void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix); -void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix); -void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix); -void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix); -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix); -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix); -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix); -void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); -void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); -void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); -void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); -void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); -void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix); -void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix); -void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix); -void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); -void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix); -void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); -void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); -void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width); +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width); +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width); +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width); +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width); +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width); +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width); +void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width); +void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width); +void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width); +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width); +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width); +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width); +void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); +void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); +void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); +void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width); +void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); +void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width); +void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width); +void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width); +void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width); +void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, + int width); +void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, + int width); void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, @@ -665,6 +729,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, uint8* dst_u, uint8* dst_v, int width); void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, @@ -676,33 +744,31 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width); void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix); -void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix); + int width); void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix); + int width); void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, @@ -729,25 +795,15 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV422Row_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); - void ARGBToUV444Row_C(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV422Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); void ARGBToUV411Row_C(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJ422Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width); +void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width); void MirrorRow_C(const uint8* src, uint8* dst, int width); void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); @@ -758,10 +814,9 @@ void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); -void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); +void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); @@ -771,20 +826,23 @@ void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix); +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix); + int width); void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix); + int width); void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix); -void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix); + int width); +void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); @@ -816,10 +874,26 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count); void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); + +void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width); +void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width); +void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width); +void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a, + int width); +void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a, + int width); void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, + int width); +void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, + int width); void SetRow_C(uint8* dst, uint8 v8, int count); void SetRow_X86(uint8* dst, uint8 v8, int count); @@ -835,524 +909,541 @@ void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); // ARGBShufflers for BGRAToARGB etc. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); + const uint8* shuffler, int width); -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width); +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); +void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width); void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int pix); + int width); void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int pix); -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix); + int width); +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width); void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int pix); + int width); void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int pix); + int width); -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix); +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width); +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width); +void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width); void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int pix); + int width); void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int pix); -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix); -void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); -void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); -void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); -void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); -void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); + int width); +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width); +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width); +void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width); +void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width); +void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, + int width); +void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); +void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int pix); + int width); void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int pix); + int width); void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int pix); + int width); void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int pix); + int width); void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int pix); + int width); void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int pix); + int width); -void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); -void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, + int width); +void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width); +void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, - int pix); + int width); void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, - int pix); + int width); void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, - int pix); + int width); -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix); + const uint32 dither4, int width); void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix); + const uint32 dither4, int width); void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix); + const uint32 dither4, int width); -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width); -void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); - -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix); -void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); -void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); -void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix); -void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); +void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width); + +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); void I444ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I411ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_C(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void NV21ToRGB565Row_C(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - int width); void NV12ToRGB565Row_C(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV21ToARGBRow_C(const uint8* src_y, - const uint8* src_vu, + const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_C(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_C(const uint8* src_uyvy, uint8* dst_argb, - int width); -void J422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToBGRARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_bgra, - int width); -void I422ToABGRRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_abgr, + const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); -void I422ToRAWRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_raw, - int width); void I422ToARGB4444Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void I422ToBGRARow_AVX2(const uint8* src_y, +void I422ToARGBRow_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void I422ToABGRRow_AVX2(const uint8* src_y, +void I444ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I444ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I444ToARGBRow_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I411ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I411ToARGBRow_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_AVX2(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void NV21ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - int width); +void NV21ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void J422ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void J422ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToBGRARow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_bgra, - int width); -void I422ToABGRRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_abgr, - int width); void I422ToRGBARow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); -void I422ToRAWRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_raw, - int width); -void I422ToRAWRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_raw, - int width); void I422ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, - int width); -void I422ToBGRARow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, - int width); -void I422ToABGRRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I444ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I444ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I411ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I411ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); +void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void NV21ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_vu, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_Any_AVX2(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToRGB565Row_Any_AVX2(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void J422ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void J422ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToBGRARow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_bgra, - int width); -void I422ToABGRRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_abgr, - int width); void I422ToRGBARow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void I422ToRAWRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToRAWRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); @@ -1365,13 +1456,23 @@ void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); // ARGB preattenuated alpha blend. void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); -void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); +// Unattenuated planar alpha blend. +void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_C(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); + // ARGB multiply images. Same API as Blend, but these require // pointer and width alignment for SSE2. void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, @@ -1422,26 +1523,32 @@ void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); -void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, + int width); +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, + int width); void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix); + const uint32 dither4, int width); void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix); + const uint32 dither4, int width); -void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, + int width); +void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, + int width); -void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, + int width); +void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, + int width); void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width); @@ -1449,186 +1556,169 @@ void I444ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I411ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, - int width); -void I422ToBGRARow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToABGRRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void I422ToRAWRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); void I422ToARGB4444Row_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV21ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_uv, + const uint8* src_vu, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_Any_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - int width); -void NV21ToRGB565Row_Any_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); +void I422ToARGBRow_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); + uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); + uint8* dst_u, uint8* dst_v, int width); void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, @@ -1673,7 +1763,6 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y, // Effects related row functions. void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); @@ -1753,9 +1842,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); @@ -1765,24 +1851,21 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); +void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); +void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, ptrdiff_t src_stride_ptr, diff --git a/third_party/libyuv/include/libyuv/scale_argb.h b/third_party/libyuv/include/libyuv/scale_argb.h index 0c9b36257..b56cf5209 100644 --- a/third_party/libyuv/include/libyuv/scale_argb.h +++ b/third_party/libyuv/include/libyuv/scale_argb.h @@ -35,7 +35,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering); -// TODO(fbarchard): Implement this. // Scale with YUV conversion to ARGB and clipping. LIBYUV_API int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h index 94ad9cf86..df699e6c2 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -23,6 +23,26 @@ extern "C" { (defined(__i386__) && !defined(__SSE2__)) #define LIBYUV_DISABLE_X86 #endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif + +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ // Visual C 2012 required for AVX2. #if defined(_M_IX86) && !defined(__clang__) && \ @@ -42,24 +62,23 @@ extern "C" { #define HAS_SCALEARGBROWDOWNEVEN_SSE2 #define HAS_SCALECOLSUP2_SSE2 #define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN2_SSSE3 #define HAS_SCALEROWDOWN34_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSE2 +#define HAS_SCALEROWDOWN4_SSSE3 +#define HAS_SCALEADDROW_SSE2 #endif -// The following are available on VS2012: -#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_SCALEADDROW_AVX2 #define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN4_AVX2 #endif -// The following are available on Visual C: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__) -#define HAS_SCALEADDROW_SSE2 -#endif - // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) @@ -77,10 +96,10 @@ extern "C" { // The following are available on Mips platforms: #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_SCALEROWDOWN2_MIPS_DSPR2 -#define HAS_SCALEROWDOWN4_MIPS_DSPR2 -#define HAS_SCALEROWDOWN34_MIPS_DSPR2 -#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#define HAS_SCALEROWDOWN2_DSPR2 +#define HAS_SCALEROWDOWN4_DSPR2 +#define HAS_SCALEROWDOWN34_DSPR2 +#define HAS_SCALEROWDOWN38_DSPR2 #endif // Scale ARGB vertically with bilinear interpolation. @@ -133,6 +152,8 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst, int dst_width); void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); +void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst, int dst_width); void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, @@ -214,22 +235,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx); // Specialized scalers for x86. -void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -251,22 +272,26 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -418,6 +443,8 @@ void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); +void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, @@ -447,28 +474,26 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx); - -void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h index 9d1d746c2..896d1d9b7 100644 --- a/third_party/libyuv/include/libyuv/version.h +++ b/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1456 +#define LIBYUV_VERSION 1601 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/video_common.h b/third_party/libyuv/include/libyuv/video_common.h index cb6582f24..ad934e424 100644 --- a/third_party/libyuv/include/libyuv/video_common.h +++ b/third_party/libyuv/include/libyuv/video_common.h @@ -62,7 +62,7 @@ enum FourCC { // 2 Secondary YUV formats: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), @@ -90,7 +90,8 @@ enum FourCC { FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. FOURCC_J420 = FOURCC('J', '4', '2', '0'), - FOURCC_J400 = FOURCC('J', '4', '0', '0'), + FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc + FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. @@ -150,6 +151,7 @@ enum FourCCBpp { FOURCC_BPP_YU12 = 12, FOURCC_BPP_J420 = 12, FOURCC_BPP_J400 = 8, + FOURCC_BPP_H420 = 12, FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_H264 = 0, FOURCC_BPP_IYUV = 12, diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc index 46aa8473d..e3846bdfd 100644 --- a/third_party/libyuv/source/compare.cc +++ b/third_party/libyuv/source/compare.cc @@ -17,6 +17,7 @@ #endif #include "libyuv/basic_types.h" +#include "libyuv/compare_row.h" #include "libyuv/cpu_id.h" #include "libyuv/row.h" #include "libyuv/video_common.h" @@ -27,29 +28,12 @@ extern "C" { #endif // hash seed of 5381 recommended. -// Internal C version of HashDjb2 with int sized count for efficiency. -uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); - -// This module is for Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || \ - (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) -#define HAS_HASHDJB2_SSE41 -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); - -#ifdef VISUALC_HAS_AVX2 -#define HAS_HASHDJB2_AVX2 -uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); -#endif - -#endif // HAS_HASHDJB2_SSE41 - -// hash seed of 5381 recommended. LIBYUV_API uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = + HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { HashDjb2_SSE = HashDjb2_SSE41; @@ -127,23 +111,6 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } -uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SUMSQUAREERROR_NEON -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); -#endif -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_SUMSQUAREERROR_SSE2 -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); -#endif - -#ifdef VISUALC_HAS_AVX2 -#define HAS_SUMSQUAREERROR_AVX2 -uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); -#endif - // TODO(fbarchard): Refactor into row function. LIBYUV_API uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, diff --git a/third_party/libyuv/source/compare_common.cc b/third_party/libyuv/source/compare_common.cc index c546b5182..42fc58935 100644 --- a/third_party/libyuv/source/compare_common.cc +++ b/third_party/libyuv/source/compare_common.cc @@ -10,6 +10,8 @@ #include "libyuv/basic_types.h" +#include "libyuv/compare_row.h" + #ifdef __cplusplus namespace libyuv { extern "C" { diff --git a/third_party/libyuv/source/compare_gcc.cc b/third_party/libyuv/source/compare_gcc.cc index 247cb33bb..1b83edb16 100644 --- a/third_party/libyuv/source/compare_gcc.cc +++ b/third_party/libyuv/source/compare_gcc.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -16,11 +18,13 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 sse; - asm volatile ( // NOLINT + asm volatile ( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN @@ -54,15 +58,10 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { "+r"(count), // %2 "=g"(sse) // %3 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); // NOLINT + ); return sse; } -#endif // defined(__x86_64__) || defined(__i386__) - -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) -#define HAS_HASHDJB2_SSE41 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 static uvec32 kHashMul0 = { 0x0c3525e1, // 33 ^ 15 @@ -91,7 +90,7 @@ static uvec32 kHashMul3 = { uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { uint32 hash; - asm volatile ( // NOLINT + asm volatile ( "movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" @@ -140,7 +139,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { "m"(kHashMul3) // %8 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); // NOLINT + ); return hash; } #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc index ef006ec41..49aa3b4ee 100644 --- a/third_party/libyuv/source/compare_neon.cc +++ b/third_party/libyuv/source/compare_neon.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -27,7 +29,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { "vmov.u8 q9, #0 \n" "vmov.u8 q11, #0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" diff --git a/third_party/libyuv/source/compare_neon64.cc b/third_party/libyuv/source/compare_neon64.cc index 6d1e5e1bc..f9c7df98c 100644 --- a/third_party/libyuv/source/compare_neon64.cc +++ b/third_party/libyuv/source/compare_neon64.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -26,7 +28,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { "eor v17.16b, v17.16b, v17.16b \n" "eor v19.16b, v19.16b, v19.16b \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc index 19806f275..dc86fe25b 100644 --- a/third_party/libyuv/source/compare_win.cc +++ b/third_party/libyuv/source/compare_win.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -16,9 +18,8 @@ namespace libyuv { extern "C" { #endif -// This module is for Visual C x86. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ - defined(_MSC_VER) && !defined(__clang__) +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) __declspec(naked) uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { @@ -100,41 +101,32 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { } #endif // _MSC_VER >= 1700 -#define HAS_HASHDJB2_SSE41 -static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 -static uvec32 kHashMul0 = { +uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +uvec32 kHashMul0 = { 0x0c3525e1, // 33 ^ 15 0xa3476dc1, // 33 ^ 14 0x3b4039a1, // 33 ^ 13 0x4f5f0981, // 33 ^ 12 }; -static uvec32 kHashMul1 = { +uvec32 kHashMul1 = { 0x30f35d61, // 33 ^ 11 0x855cb541, // 33 ^ 10 0x040a9121, // 33 ^ 9 0x747c7101, // 33 ^ 8 }; -static uvec32 kHashMul2 = { +uvec32 kHashMul2 = { 0xec41d4e1, // 33 ^ 7 0x4cfa3cc1, // 33 ^ 6 0x025528a1, // 33 ^ 5 0x00121881, // 33 ^ 4 }; -static uvec32 kHashMul3 = { +uvec32 kHashMul3 = { 0x00008c61, // 33 ^ 3 0x00000441, // 33 ^ 2 0x00000021, // 33 ^ 1 0x00000001, // 33 ^ 0 }; -// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 -// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 -// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 -// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 -// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 -#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ - _asm _emit 0x40 _asm _emit reg - __declspec(naked) uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { __asm { @@ -143,30 +135,30 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { movd xmm0, [esp + 12] // seed pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, kHash16x33 + movdqa xmm6, xmmword ptr kHash16x33 wloop: movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] - pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 - movdqa xmm5, kHashMul0 + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + movdqa xmm5, xmmword ptr kHashMul0 movdqa xmm2, xmm1 punpcklbw xmm2, xmm7 // src[0-7] movdqa xmm3, xmm2 punpcklwd xmm3, xmm7 // src[0-3] - pmulld(0xdd) // pmulld xmm3, xmm5 - movdqa xmm5, kHashMul1 + pmulld xmm3, xmm5 + movdqa xmm5, xmmword ptr kHashMul1 movdqa xmm4, xmm2 punpckhwd xmm4, xmm7 // src[4-7] - pmulld(0xe5) // pmulld xmm4, xmm5 - movdqa xmm5, kHashMul2 + pmulld xmm4, xmm5 + movdqa xmm5, xmmword ptr kHashMul2 punpckhbw xmm1, xmm7 // src[8-15] movdqa xmm2, xmm1 punpcklwd xmm2, xmm7 // src[8-11] - pmulld(0xd5) // pmulld xmm2, xmm5 - movdqa xmm5, kHashMul3 + pmulld xmm2, xmm5 + movdqa xmm5, xmmword ptr kHashMul3 punpckhwd xmm1, xmm7 // src[12-15] - pmulld(0xcd) // pmulld xmm1, xmm5 + pmulld xmm1, xmm5 paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 paddd xmm1, xmm3 @@ -191,36 +183,37 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - movdqa xmm6, kHash16x33 + vmovd xmm0, [esp + 12] // seed wloop: - vpmovzxbd xmm3, dword ptr [eax] // src[0-3] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 - vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] - pmulld xmm3, kHashMul0 - vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] - pmulld xmm4, kHashMul1 - vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] - pmulld xmm2, kHashMul2 + vpmovzxbd xmm3, [eax] // src[0-3] + vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 + vpmovzxbd xmm4, [eax + 4] // src[4-7] + vpmulld xmm3, xmm3, xmmword ptr kHashMul0 + vpmovzxbd xmm2, [eax + 8] // src[8-11] + vpmulld xmm4, xmm4, xmmword ptr kHashMul1 + vpmovzxbd xmm1, [eax + 12] // src[12-15] + vpmulld xmm2, xmm2, xmmword ptr kHashMul2 lea eax, [eax + 16] - pmulld xmm1, kHashMul3 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 + vpmulld xmm1, xmm1, xmmword ptr kHashMul3 + vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm1, xmm1, xmm3 + vpshufd xmm2, xmm1, 0x0e // upper 2 dwords + vpaddd xmm1, xmm1,xmm2 + vpshufd xmm2, xmm1, 0x01 + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm0, xmm0, xmm1 sub ecx, 16 jg wloop - movd eax, xmm0 // return hash + vmovd eax, xmm0 // return hash + vzeroupper ret } } #endif // _MSC_VER >= 1700 + #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #ifdef __cplusplus diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc index 3ad6bd7a4..e332bc505 100644 --- a/third_party/libyuv/source/convert.cc +++ b/third_party/libyuv/source/convert.cc @@ -245,8 +245,8 @@ static int X420ToI420(const uint8* src_y, int y; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = - SplitUVRow_C; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) = SplitUVRow_C; if (!src_y || !src_uv || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -303,14 +303,14 @@ static int X420ToI420(const uint8* src_y, } } #endif -#if defined(HAS_SPLITUVROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_SPLITUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; + SplitUVRow = SplitUVRow_Any_DSPR2; if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_MIPS_DSPR2; + SplitUVRow = SplitUVRow_DSPR2; } } #endif @@ -390,9 +390,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, int width, int height) { int y; void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C; + uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C; void (*YUY2ToYRow)(const uint8* src_yuy2, - uint8* dst_y, int pix) = YUY2ToYRow_C; + uint8* dst_y, int width) = YUY2ToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -455,9 +455,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, int width, int height) { int y; void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C; + uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C; void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int pix) = UYVYToYRow_C; + uint8* dst_y, int width) = UYVYToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -521,7 +521,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, int y; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || !dst_u || !dst_v || @@ -597,7 +597,7 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, int y; void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; - void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = + void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) = BGRAToYRow_C; if (!src_bgra || !dst_y || !dst_u || !dst_v || @@ -663,7 +663,7 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, int y; void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) = + void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) = ABGRToYRow_C; if (!src_abgr || !dst_y || !dst_u || !dst_v || @@ -729,7 +729,7 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, int y; void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; - void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) = + void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) = RGBAToYRow_C; if (!src_rgba || !dst_y || !dst_u || !dst_v || @@ -796,14 +796,14 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, #if defined(HAS_RGB24TOYROW_NEON) void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; - void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) = + void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) = RGB24ToYRow_C; #else - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RGB24ToARGBRow_C; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif if (!src_rgb24 || !dst_y || !dst_u || !dst_v || @@ -910,14 +910,14 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, #if defined(HAS_RAWTOYROW_NEON) void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; - void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) = + void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) = RAWToYRow_C; #else - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif if (!src_raw || !dst_y || !dst_u || !dst_v || @@ -1024,14 +1024,14 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, #if defined(HAS_RGB565TOYROW_NEON) void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; - void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) = + void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) = RGB565ToYRow_C; #else - void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RGB565ToARGBRow_C; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif if (!src_rgb565 || !dst_y || !dst_u || !dst_v || @@ -1146,14 +1146,14 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, #if defined(HAS_ARGB1555TOYROW_NEON) void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = + void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) = ARGB1555ToYRow_C; #else - void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = ARGB1555ToARGBRow_C; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif if (!src_argb1555 || !dst_y || !dst_u || !dst_v || @@ -1270,14 +1270,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, #if defined(HAS_ARGB4444TOYROW_NEON) void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = + void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) = ARGB4444ToYRow_C; #else - void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = ARGB4444ToARGBRow_C; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif if (!src_argb4444 || !dst_y || !dst_u || !dst_v || diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc index 44756bc41..fb9582d62 100644 --- a/third_party/libyuv/source/convert_argb.cc +++ b/third_party/libyuv/source/convert_argb.cc @@ -14,6 +14,7 @@ #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif +#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle. #include "libyuv/rotate_argb.h" #include "libyuv/row.h" #include "libyuv/video_common.h" @@ -44,21 +45,21 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, return 0; } -// Convert I444 to ARGB. -LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Convert I422 to ARGB with matrix +static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, int height) { int y; - void (*I444ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I444ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || + const struct YuvConstants* yuvconstants, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -68,62 +69,155 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u == width && - src_stride_v == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif -#if defined(HAS_I444TOARGBROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I444ToARGBRow = I444ToARGBRow_Any_AVX2; + I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I444ToARGBRow = I444ToARGBRow_AVX2; + I422ToARGBRow = I422ToARGBRow_AVX2; } } #endif -#if defined(HAS_I444TOARGBROW_NEON) +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I444ToARGBRow = I444ToARGBRow_Any_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } } #endif +#if defined(HAS_I422TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_DSPR2; + } +#endif for (y = 0; y < height; ++y) { - I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } } return 0; } -// Convert I422 to ARGB. +// Convert I420 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return I420ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvI601Constants, + width, height); +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I420ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return I420ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvJPEGConstants, + width, height); +} + +// Convert J420 to ABGR. +LIBYUV_API +int J420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I420ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H420 to ARGB. +LIBYUV_API +int H420ToARGB(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, uint8* dst_argb, int dst_stride_argb, int width, int height) { + return I420ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvH709Constants, + width, height); +} + +// Convert H420 to ABGR. +LIBYUV_API +int H420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I420ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I422 to ARGB with matrix +static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, int height) { int y; void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || @@ -169,18 +263,18 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && +#if defined(HAS_I422TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + I422ToARGBRow = I422ToARGBRow_DSPR2; } #endif for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; @@ -189,6 +283,210 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return I422ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvI601Constants, + width, height); +} + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I422ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return I422ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvJPEGConstants, + width, height); +} + +// Convert J422 to ABGR. +LIBYUV_API +int J422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I422ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H422 to ARGB. +LIBYUV_API +int H422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return I422ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvH709Constants, + width, height); +} + +// Convert H422 to ABGR. +LIBYUV_API +int H422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I422ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I444 to ARGB with matrix +static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, int height) { + int y; + void (*I444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return I444ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvI601Constants, + width, height); +} + +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I444ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J444 to ARGB. +LIBYUV_API +int J444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return I444ToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_argb, dst_stride_argb, + &kYuvJPEGConstants, + width, height); +} + // Convert I411 to ARGB. LIBYUV_API int I411ToARGB(const uint8* src_y, int src_stride_y, @@ -201,6 +499,7 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I411ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || @@ -248,7 +547,7 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; @@ -257,6 +556,143 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert I420 with Alpha to preattenuated ARGB. +static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + const uint8* src_a, int src_stride_a, + uint8* dst_argb, int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, int height, int attenuate) { + int y; + void (*I422AlphaToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I422AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2; + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 with Alpha to ARGB. +LIBYUV_API +int I420AlphaToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + const uint8* src_a, int src_stride_a, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_a, src_stride_a, + dst_argb, dst_stride_argb, + &kYuvI601Constants, + width, height, attenuate); +} + +// Convert I420 with Alpha to ABGR. +LIBYUV_API +int I420AlphaToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + const uint8* src_a, int src_stride_a, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height, int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + src_a, src_stride_a, + dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + // Convert I400 to ARGB. LIBYUV_API int I400ToARGB(const uint8* src_y, int src_stride_y, @@ -322,7 +758,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) = J400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { @@ -449,7 +885,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RGB24ToARGBRow_C; if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { @@ -499,7 +935,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, uint8* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RAWToARGBRow_C; if (!src_raw || !dst_argb || width <= 0 || height == 0) { @@ -549,7 +985,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = + void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) = RGB565ToARGBRow_C; if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { @@ -608,7 +1044,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, int width, int height) { int y; void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, - int pix) = ARGB1555ToARGBRow_C; + int width) = ARGB1555ToARGBRow_C; if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -666,7 +1102,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, int width, int height) { int y; void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, - int pix) = ARGB4444ToARGBRow_C; + int width) = ARGB4444ToARGBRow_C; if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -727,6 +1163,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { @@ -764,7 +1201,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - NV12ToARGBRow(src_y, src_uv, dst_argb, width); + NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -784,6 +1221,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, void (*NV21ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { @@ -821,7 +1259,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_uv, dst_argb, width); + NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -840,6 +1278,7 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; if (!src_m420 || !dst_argb || width <= 0 || height == 0) { @@ -877,14 +1316,16 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, #endif for (y = 0; y < height - 1; y += 2) { - NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, + &kYuvI601Constants, width); NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, - dst_argb + dst_stride_argb, width); + dst_argb + dst_stride_argb, &kYuvI601Constants, width); dst_argb += dst_stride_argb * 2; src_m420 += src_stride_m420 * 3; } if (height & 1) { - NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, + &kYuvI601Constants, width); } return 0; } @@ -895,7 +1336,10 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = + void (*YUY2ToARGBRow)(const uint8* src_yuy2, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = YUY2ToARGBRow_C; if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { @@ -939,7 +1383,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, } #endif for (y = 0; y < height; ++y) { - YUY2ToARGBRow(src_yuy2, dst_argb, width); + YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); src_yuy2 += src_stride_yuy2; dst_argb += dst_stride_argb; } @@ -952,7 +1396,10 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = + void (*UYVYToARGBRow)(const uint8* src_uyvy, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = UYVYToARGBRow_C; if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { @@ -996,159 +1443,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, } #endif for (y = 0; y < height; ++y) { - UYVYToARGBRow(src_uyvy, dst_argb, width); + UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); src_uyvy += src_stride_uyvy; dst_argb += dst_stride_argb; } return 0; } -// Convert J420 to ARGB. -LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - int y; - void (*J422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = J422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_J422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - J422ToARGBRow = J422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - J422ToARGBRow = J422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_J422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - J422ToARGBRow = J422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - J422ToARGBRow = J422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_J422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - J422ToARGBRow = J422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - J422ToARGBRow = J422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; - } -#endif - - for (y = 0; y < height; ++y) { - J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert J422 to ARGB. -LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - int y; - void (*J422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = J422ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_J422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - J422ToARGBRow = J422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - J422ToARGBRow = J422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_J422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - J422ToARGBRow = J422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - J422ToARGBRow = J422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_J422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - J422ToARGBRow = J422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - J422ToARGBRow = J422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; - } -#endif - - for (y = 0; y < height; ++y) { - J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/convert_from.cc b/third_party/libyuv/source/convert_from.cc index 31f1ac992..46abdebcd 100644 --- a/third_party/libyuv/source/convert_from.cc +++ b/third_party/libyuv/source/convert_from.cc @@ -445,71 +445,72 @@ int I420ToNV21(const uint8* src_y, int src_stride_y, return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, - dst_y, src_stride_y, + dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } -// Convert I420 to ARGB. -LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Convert I422 to RGBA with matrix +static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, + void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || + const struct YuvConstants* yuvconstants, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if defined(HAS_I422TORGBAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif -#if defined(HAS_I422TOARGBROW_AVX2) +#if defined(HAS_I422TORGBAROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; + I422ToRGBARow = I422ToRGBARow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; + I422ToRGBARow = I422ToRGBARow_AVX2; } } #endif -#if defined(HAS_I422TOARGBROW_NEON) +#if defined(HAS_I422TORGBAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; + I422ToRGBARow = I422ToRGBARow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; + I422ToRGBARow = I422ToRGBARow_NEON; } } #endif -#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && +#if defined(HAS_I422TORGBAROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { + I422ToRGBARow = I422ToRGBARow_DSPR2; } #endif for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; @@ -519,207 +520,49 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, return 0; } -// Convert I420 to BGRA. +// Convert I420 to RGBA. LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, +int I420ToRGBA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, + uint8* dst_rgba, int dst_stride_rgba, int width, int height) { - int y; - void (*I422ToBGRARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToBGRARow_C; - if (!src_y || !src_u || !src_v || !dst_bgra || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; - dst_stride_bgra = -dst_stride_bgra; - } -#if defined(HAS_I422TOBGRAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToBGRARow = I422ToBGRARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOBGRAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToBGRARow = I422ToBGRARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToBGRARow = I422ToBGRARow_AVX2; - } - } -#endif -#if defined(HAS_I422TOBGRAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToBGRARow = I422ToBGRARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_NEON; - } - } -#endif -#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { - I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; - } -#endif - - for (y = 0; y < height; ++y) { - I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); - dst_bgra += dst_stride_bgra; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; + return I420ToRGBAMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_rgba, dst_stride_rgba, + &kYuvI601Constants, + width, height); } -// Convert I420 to ABGR. -LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - int y; - void (*I422ToABGRRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToABGRRow_C; - if (!src_y || !src_u || !src_v || !dst_abgr || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; - dst_stride_abgr = -dst_stride_abgr; - } -#if defined(HAS_I422TOABGRROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToABGRRow = I422ToABGRRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOABGRROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToABGRRow = I422ToABGRRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToABGRRow = I422ToABGRRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOABGRROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToABGRRow = I422ToABGRRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); - dst_abgr += dst_stride_abgr; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGBA. +// Convert I420 to BGRA. LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, +int I420ToBGRA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, + uint8* dst_bgra, int dst_stride_bgra, int width, int height) { - int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; + return I420ToRGBAMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); } -// Convert I420 to RGB24. -LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +// Convert I420 to RGB24 with matrix +static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb24, int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, int height) { int y; void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I422ToRGB24Row_C; if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { @@ -757,7 +600,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); dst_rgb24 += dst_stride_rgb24; src_y += src_stride_y; if (y & 1) { @@ -768,64 +611,34 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, return 0; } -// Convert I420 to RAW. +// Convert I420 to RGB24. LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, +int I420ToRGB24(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_raw, int dst_stride_raw, + uint8* dst_rgb24, int dst_stride_rgb24, int width, int height) { - int y; - void (*I422ToRAWRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToRAWRow_C; - if (!src_y || !src_u || !src_v || !dst_raw || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_raw = dst_raw + (height - 1) * dst_stride_raw; - dst_stride_raw = -dst_stride_raw; - } -#if defined(HAS_I422TORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRAWRow = I422ToRAWRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRAWRow = I422ToRAWRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORAWROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRAWRow = I422ToRAWRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRAWRow = I422ToRAWRow_AVX2; - } - } -#endif -#if defined(HAS_I422TORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRAWRow = I422ToRAWRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRAWRow = I422ToRAWRow_NEON; - } - } -#endif + return I420ToRGB24Matrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, + width, height); +} - for (y = 0; y < height; ++y) { - I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); - dst_raw += dst_stride_raw; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); } // Convert I420 to ARGB1555. @@ -840,6 +653,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I422ToARGB1555Row_C; if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || height == 0) { @@ -877,7 +691,8 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width); + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, + width); dst_argb1555 += dst_stride_argb1555; src_y += src_stride_y; if (y & 1) { @@ -901,6 +716,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I422ToARGB4444Row_C; if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || height == 0) { @@ -938,7 +754,8 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width); + I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, + width); dst_argb4444 += dst_stride_argb4444; src_y += src_stride_y; if (y & 1) { @@ -961,6 +778,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { @@ -998,7 +816,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width); + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -1029,9 +847,10 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; + const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -1069,12 +888,12 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && +#if defined(HAS_I422TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + I422ToARGBRow = I422ToARGBRow_DSPR2; } #endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) @@ -1105,7 +924,7 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, // Allocate a row of argb. align_buffer_64(row_argb, width * 4); for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row_argb, width); + I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, *(uint32*)(dither4x4 + ((y & 3) << 2)), width); dst_rgb565 += dst_stride_rgb565; @@ -1258,7 +1077,6 @@ int ConvertFromI420(const uint8* y, int y_stride, // Triplanar formats // TODO(fbarchard): halfstride instead of halfwidth case FOURCC_I420: - case FOURCC_YU12: case FOURCC_YV12: { int halfwidth = (width + 1) / 2; int halfheight = (height + 1) / 2; diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc index 8d1e97aec..2a8682b7e 100644 --- a/third_party/libyuv/source/convert_from_argb.cc +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -28,10 +28,10 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, uint8* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) = ARGBToUV444Row_C; + int width) = ARGBToUV444Row_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -109,13 +109,16 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, uint8* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) = ARGBToUV422Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; @@ -130,34 +133,22 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOUV422ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUV422ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } @@ -170,9 +161,17 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif for (y = 0; y < height; ++y) { - ARGBToUV422Row(src_argb, dst_u, dst_v, width); + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); src_argb += src_stride_argb; dst_y += dst_stride_y; @@ -191,8 +190,8 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, int width, int height) { int y; void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) = ARGBToUV411Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + int width) = ARGBToUV411Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -264,7 +263,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, int halfwidth = (width + 1) >> 1; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; @@ -373,7 +372,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, int halfwidth = (width + 1) >> 1; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; @@ -478,9 +477,9 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, uint8* dst_yuy2, int dst_stride_yuy2, int width, int height) { int y; - void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) = ARGBToUV422Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; @@ -502,34 +501,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, height = 1; src_stride_argb = dst_stride_yuy2 = 0; } -#if defined(HAS_ARGBTOUV422ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUV422ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } @@ -542,7 +529,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif - +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -567,7 +561,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, uint8* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { - ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToUVRow(src_argb, 0, row_u, row_v, width); ARGBToYRow(src_argb, row_y, width); I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); src_argb += src_stride_argb; @@ -585,9 +579,9 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, uint8* dst_uyvy, int dst_stride_uyvy, int width, int height) { int y; - void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) = ARGBToUV422Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; @@ -609,34 +603,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, height = 1; src_stride_argb = dst_stride_uyvy = 0; } -#if defined(HAS_ARGBTOUV422ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUV422ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } @@ -649,7 +631,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif - +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -674,7 +663,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, uint8* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { - ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToUVRow(src_argb, 0, row_u, row_v, width); ARGBToYRow(src_argb, row_y, width); I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); src_argb += src_stride_argb; @@ -692,7 +681,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, int width, int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || width <= 0 || height == 0) { return -1; @@ -764,7 +753,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb24, int dst_stride_rgb24, int width, int height) { int y; - void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToRGB24Row_C; if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { return -1; @@ -812,7 +801,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, uint8* dst_raw, int dst_stride_raw, int width, int height) { int y; - void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToRAWRow_C; if (!src_argb || !dst_raw || width <= 0 || height == 0) { return -1; @@ -869,7 +858,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, const uint8* dither4x4, int width, int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; + const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -921,7 +910,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height) { int y; - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToRGB565Row_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -977,7 +966,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, uint8* dst_argb1555, int dst_stride_argb1555, int width, int height) { int y; - void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToARGB1555Row_C; if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { return -1; @@ -1033,7 +1022,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, uint8* dst_argb4444, int dst_stride_argb4444, int width, int height) { int y; - void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToARGB4444Row_C; if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { return -1; @@ -1093,7 +1082,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, int y; void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || !dst_u || !dst_v || @@ -1157,21 +1146,24 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, return 0; } -// ARGB little endian (bgra in memory) to J422 +// Convert ARGB to J422. (JPeg full range I422). LIBYUV_API int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, + uint8* dst_yj, int dst_stride_yj, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) = ARGBToUVJ422Row_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) = + void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || + !dst_yj || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; @@ -1179,34 +1171,19 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, } // Coalesce rows. if (src_stride_argb == width * 4 && - dst_stride_y == width && + dst_stride_yj == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOUVJ422ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3; - } + src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; } -#endif -#if defined(HAS_ARGBTOUVJ422ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJ422Row = ARGBToUVJ422Row_NEON; - } - } -#endif - -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } @@ -1227,12 +1204,20 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif for (y = 0; y < height; ++y) { - ARGBToUVJ422Row(src_argb, dst_u, dst_v, width); - ARGBToYJRow(src_argb, dst_y, width); + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; - dst_y += dst_stride_y; + dst_yj += dst_stride_yj; dst_u += dst_stride_u; dst_v += dst_stride_v; } @@ -1245,7 +1230,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, uint8* dst_yj, int dst_stride_yj, int width, int height) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; diff --git a/third_party/libyuv/source/convert_jpeg.cc b/third_party/libyuv/source/convert_jpeg.cc index bcb980f7f..90f550a26 100644 --- a/third_party/libyuv/source/convert_jpeg.cc +++ b/third_party/libyuv/source/convert_jpeg.cc @@ -9,6 +9,7 @@ */ #include "libyuv/convert.h" +#include "libyuv/convert_argb.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" diff --git a/third_party/libyuv/source/convert_to_argb.cc b/third_party/libyuv/source/convert_to_argb.cc index af829fbd3..aecdc80fd 100644 --- a/third_party/libyuv/source/convert_to_argb.cc +++ b/third_party/libyuv/source/convert_to_argb.cc @@ -23,7 +23,7 @@ namespace libyuv { extern "C" { #endif -// Convert camera sample to I420 with cropping, rotation and vertical flip. +// Convert camera sample to ARGB with cropping, rotation and vertical flip. // src_width is used for source stride computation // src_height is used to compute location of planes, and indicate inversion // sample_size is measured in bytes and is the size of the frame. @@ -51,8 +51,8 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, // also enable temporary buffer. LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || crop_argb == sample; - uint8* tmp_argb = crop_argb; - int tmp_argb_stride = argb_stride; + uint8* dest_argb = crop_argb; + int dest_argb_stride = argb_stride; uint8* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; @@ -66,13 +66,13 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, } if (need_buf) { - int argb_size = crop_width * abs_crop_height * 4; + int argb_size = crop_width * 4 * abs_crop_height; rotate_buffer = (uint8*)malloc(argb_size); if (!rotate_buffer) { return 1; // Out of memory runtime error. } crop_argb = rotate_buffer; - argb_stride = crop_width; + argb_stride = crop_width * 4; } switch (format) { @@ -176,7 +176,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, break; // Triplanar formats case FOURCC_I420: - case FOURCC_YU12: case FOURCC_YV12: { const uint8* src_y = sample + (src_width * crop_y + crop_x); const uint8* src_u; @@ -291,7 +290,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { if (!r) { r = ARGBRotate(crop_argb, argb_stride, - tmp_argb, tmp_argb_stride, + dest_argb, dest_argb_stride, crop_width, abs_crop_height, rotation); } free(rotate_buffer); diff --git a/third_party/libyuv/source/convert_to_i420.cc b/third_party/libyuv/source/convert_to_i420.cc index 5e75369b5..e5f307c44 100644 --- a/third_party/libyuv/source/convert_to_i420.cc +++ b/third_party/libyuv/source/convert_to_i420.cc @@ -39,12 +39,13 @@ int ConvertToI420(const uint8* sample, int aligned_src_width = (src_width + 1) & ~1; const uint8* src; const uint8* src_uv; - int abs_src_height = (src_height < 0) ? -src_height : src_height; - int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + const int abs_src_height = (src_height < 0) ? -src_height : src_height; + // TODO(nisse): Why allow crop_height < 0? + const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && format != FOURCC_NV21 && - format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample; + format != FOURCC_YV12) || y == sample; uint8* tmp_y = y; uint8* tmp_u = u; uint8* tmp_v = v; @@ -52,16 +53,14 @@ int ConvertToI420(const uint8* sample, int tmp_u_stride = u_stride; int tmp_v_stride = v_stride; uint8* rotate_buffer = NULL; - int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + const int inv_crop_height = + (src_height < 0) ? -abs_crop_height : abs_crop_height; if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } - if (src_height < 0) { - inv_crop_height = -inv_crop_height; - } // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, @@ -214,7 +213,6 @@ int ConvertToI420(const uint8* sample, break; // Triplanar formats case FOURCC_I420: - case FOURCC_YU12: case FOURCC_YV12: { const uint8* src_y = sample + (src_width * crop_y + crop_x); const uint8* src_u; diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc index 8a10b0083..84927ebc3 100644 --- a/third_party/libyuv/source/cpu_id.cc +++ b/third_party/libyuv/source/cpu_id.cc @@ -10,12 +10,12 @@ #include "libyuv/cpu_id.h" -#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) +#if defined(_MSC_VER) #include <intrin.h> // For __cpuidex() #endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ - defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219) + defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) #include <immintrin.h> // For _xgetbv() #endif @@ -36,7 +36,8 @@ extern "C" { // For functions that use the stack and have runtime checks for overflow, // use SAFEBUFFERS to avoid additional check. -#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219) +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \ + !defined(__clang__) #define SAFEBUFFERS __declspec(safebuffers) #else #define SAFEBUFFERS @@ -48,9 +49,9 @@ extern "C" { !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { -#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) +#if defined(_MSC_VER) // Visual C version uses intrinsic or inline x86 assembly. -#if (_MSC_FULL_VER >= 160040219) +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) __cpuidex((int*)(cpu_info), info_eax, info_ecx); #elif defined(_M_IX86) __asm { @@ -63,7 +64,7 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { mov [edi + 8], ecx mov [edi + 12], edx } -#else +#else // Visual C but not x86 if (info_ecx == 0) { __cpuid((int*)(cpu_info), info_eax); } else { @@ -71,9 +72,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #endif // GCC version uses inline x86 assembly. -#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) +#else // defined(_MSC_VER) uint32 info_ebx, info_edx; - asm volatile ( // NOLINT + asm volatile ( #if defined( __i386__) && defined(__PIC__) // Preserve ebx for fpic 32 bit. "mov %%ebx, %%edi \n" @@ -89,7 +90,7 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { cpu_info[1] = info_ebx; cpu_info[2] = info_ecx; cpu_info[3] = info_edx; -#endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) +#endif // defined(_MSC_VER) } #else // (defined(_M_IX86) || defined(_M_X64) ... LIBYUV_API @@ -98,28 +99,37 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { } #endif -// TODO(fbarchard): Enable xgetbv when validator supports it. +// For VS2010 and earlier emit can be used: +// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. +// __asm { +// xor ecx, ecx // xcr 0 +// xgetbv +// mov xcr0, eax +// } +// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. +// https://code.google.com/p/libyuv/issues/detail?id=529 +#if defined(_M_IX86) && (_MSC_VER < 1900) +#pragma optimize("g", off) +#endif #if (defined(_M_IX86) || defined(_M_X64) || \ defined(__i386__) || defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) #define HAS_XGETBV // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -int TestOsSaveYmm() { +int GetXCR0() { uint32 xcr0 = 0u; -#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219) +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. -#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) - __asm { - xor ecx, ecx // xcr 0 - _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. - mov xcr0, eax - } #elif defined(__i386__) || defined(__x86_64__) asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) - return((xcr0 & 6) == 6); // Is ymm saved? + return xcr0; } #endif // defined(_M_IX86) || defined(_M_X64) .. +// Return optimization to previous setting. +#if defined(_M_IX86) && (_MSC_VER < 1900) +#pragma optimize("g", on) +#endif // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU @@ -151,30 +161,9 @@ int ArmCpuCaps(const char* cpuinfo_name) { return 0; } -#if defined(__mips__) && defined(__linux__) -static int MipsCpuCaps(const char* search_string) { - char cpuinfo_line[512]; - const char* file_name = "/proc/cpuinfo"; - FILE* f = fopen(file_name, "r"); - if (!f) { - // Assume DSP if /proc/cpuinfo is unavailable. - // This will occur for Chrome sandbox for Pepper or Render process. - return kCpuHasMIPS_DSP; - } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) { - if (strstr(cpuinfo_line, search_string) != NULL) { - fclose(f); - return kCpuHasMIPS_DSP; - } - } - fclose(f); - return 0; -} -#endif - // CPU detect function for SIMD instruction sets. LIBYUV_API -int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. +int cpu_info_ = 0; // cpu_info is not initialized yet. // Test environment variable for disabling CPU features. Any non-zero value // to disable. Zero ignored to make it easy to set the variable on/off. @@ -197,8 +186,9 @@ static LIBYUV_BOOL TestEnv(const char*) { LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { + // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized. + int cpu_info = 0; #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) - uint32 cpu_info0[4] = { 0, 0, 0, 0 }; uint32 cpu_info1[4] = { 0, 0, 0, 0 }; uint32 cpu_info7[4] = { 0, 0, 0, 0 }; @@ -207,66 +197,66 @@ int InitCpuFlags(void) { if (cpu_info0[0] >= 7) { CpuId(7, 0, cpu_info7); } - cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | - ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | - ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | - ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - kCpuHasX86; + cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + kCpuHasX86; #ifdef HAS_XGETBV - if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave - TestOsSaveYmm()) { // Saves YMM. - cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | - kCpuHasAVX; + // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv + if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave + ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers + cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; + + // Detect AVX512bw + if ((GetXCR0() & 0xe0) == 0xe0) { + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; + } } #endif + // Environment variable overrides for testing. if (TestEnv("LIBYUV_DISABLE_X86")) { - cpu_info_ &= ~kCpuHasX86; + cpu_info &= ~kCpuHasX86; } if (TestEnv("LIBYUV_DISABLE_SSE2")) { - cpu_info_ &= ~kCpuHasSSE2; + cpu_info &= ~kCpuHasSSE2; } if (TestEnv("LIBYUV_DISABLE_SSSE3")) { - cpu_info_ &= ~kCpuHasSSSE3; + cpu_info &= ~kCpuHasSSSE3; } if (TestEnv("LIBYUV_DISABLE_SSE41")) { - cpu_info_ &= ~kCpuHasSSE41; + cpu_info &= ~kCpuHasSSE41; } if (TestEnv("LIBYUV_DISABLE_SSE42")) { - cpu_info_ &= ~kCpuHasSSE42; + cpu_info &= ~kCpuHasSSE42; } if (TestEnv("LIBYUV_DISABLE_AVX")) { - cpu_info_ &= ~kCpuHasAVX; + cpu_info &= ~kCpuHasAVX; } if (TestEnv("LIBYUV_DISABLE_AVX2")) { - cpu_info_ &= ~kCpuHasAVX2; + cpu_info &= ~kCpuHasAVX2; } if (TestEnv("LIBYUV_DISABLE_ERMS")) { - cpu_info_ &= ~kCpuHasERMS; + cpu_info &= ~kCpuHasERMS; } if (TestEnv("LIBYUV_DISABLE_FMA3")) { - cpu_info_ &= ~kCpuHasFMA3; + cpu_info &= ~kCpuHasFMA3; + } + if (TestEnv("LIBYUV_DISABLE_AVX3")) { + cpu_info &= ~kCpuHasAVX3; } #endif #if defined(__mips__) && defined(__linux__) - // Linux mips parse text file for dsp detect. - cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. #if defined(__mips_dspr2) - cpu_info_ |= kCpuHasMIPS_DSPR2; + cpu_info |= kCpuHasDSPR2; #endif - cpu_info_ |= kCpuHasMIPS; - - if (getenv("LIBYUV_DISABLE_MIPS")) { - cpu_info_ &= ~kCpuHasMIPS; - } - if (getenv("LIBYUV_DISABLE_MIPS_DSP")) { - cpu_info_ &= ~kCpuHasMIPS_DSP; - } - if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { - cpu_info_ &= ~kCpuHasMIPS_DSPR2; + cpu_info |= kCpuHasMIPS; + if (getenv("LIBYUV_DISABLE_DSPR2")) { + cpu_info &= ~kCpuHasDSPR2; } #endif #if defined(__arm__) || defined(__aarch64__) @@ -274,28 +264,31 @@ int InitCpuFlags(void) { // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. // For Linux, /proc/cpuinfo can be tested but without that assume Neon. #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) - cpu_info_ = kCpuHasNEON; + cpu_info = kCpuHasNEON; // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon // flag in it. // So for aarch64, neon enabling is hard coded here. #endif #if defined(__aarch64__) - cpu_info_ = kCpuHasNEON; + cpu_info = kCpuHasNEON; #else // Linux arm parse text file for neon detect. - cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); + cpu_info = ArmCpuCaps("/proc/cpuinfo"); #endif - cpu_info_ |= kCpuHasARM; + cpu_info |= kCpuHasARM; if (TestEnv("LIBYUV_DISABLE_NEON")) { - cpu_info_ &= ~kCpuHasNEON; + cpu_info &= ~kCpuHasNEON; } #endif // __arm__ if (TestEnv("LIBYUV_DISABLE_ASM")) { - cpu_info_ = 0; + cpu_info = 0; } - return cpu_info_; + cpu_info |= kCpuInitialized; + cpu_info_ = cpu_info; + return cpu_info; } +// Note that use of this function is not thread safe. LIBYUV_API void MaskCpuFlags(int enable_flags) { cpu_info_ = InitCpuFlags() & enable_flags; diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc index 75f8a610e..50818418a 100644 --- a/third_party/libyuv/source/mjpeg_decoder.cc +++ b/third_party/libyuv/source/mjpeg_decoder.cc @@ -59,8 +59,7 @@ const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; // Methods that are passed to jpeglib. boolean fill_input_buffer(jpeg_decompress_struct* cinfo); void init_source(jpeg_decompress_struct* cinfo); -void skip_input_data(jpeg_decompress_struct* cinfo, - long num_bytes); // NOLINT +void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT void term_source(jpeg_decompress_struct* cinfo); void ErrorHandler(jpeg_common_struct* cinfo); @@ -429,8 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) { return TRUE; } -void skip_input_data(j_decompress_ptr cinfo, - long num_bytes) { // NOLINT +void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT cinfo->src->next_input_byte += num_bytes; } diff --git a/third_party/libyuv/source/mjpeg_validate.cc b/third_party/libyuv/source/mjpeg_validate.cc index 8edfbe1e7..9c4883204 100644 --- a/third_party/libyuv/source/mjpeg_validate.cc +++ b/third_party/libyuv/source/mjpeg_validate.cc @@ -17,51 +17,22 @@ namespace libyuv { extern "C" { #endif -// Enable this to try scasb implementation. -// #define ENABLE_SCASB 1 - -#ifdef ENABLE_SCASB - -// Multiple of 1. -__declspec(naked) -const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) { - __asm { - mov edx, edi - mov edi, [esp + 4] // src - mov eax, [esp + 8] // val - mov ecx, [esp + 12] // count - repne scasb - jne sr99 - mov eax, edi - sub eax, 1 - mov edi, edx - ret - - sr99: - mov eax, 0 - mov edi, edx - ret - } -} -#endif - -// Helper function to scan for EOI marker. +// Helper function to scan for EOI marker (0xff 0xd9). static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { - const uint8* end = sample + sample_size - 1; - const uint8* it = sample; - for (;;) { -#ifdef ENABLE_SCASB - it = ScanRow_ERMS(it, 0xff, end - it); -#else - it = static_cast<const uint8*>(memchr(it, 0xff, end - it)); -#endif - if (it == NULL) { - break; + if (sample_size >= 2) { + const uint8* end = sample + sample_size - 1; + const uint8* it = sample; + while (it < end) { + // TODO(fbarchard): scan for 0xd9 instead. + it = static_cast<const uint8 *>(memchr(it, 0xff, end - it)); + if (it == NULL) { + break; + } + if (it[1] == 0xd9) { + return LIBYUV_TRUE; // Success: Valid jpeg. + } + ++it; // Skip over current 0xff. } - if (it[1] == 0xd9) { - return LIBYUV_TRUE; // Success: Valid jpeg. - } - ++it; // Skip over current 0xff. } // ERROR: Invalid jpeg end code not found. Size sample_size return LIBYUV_FALSE; @@ -69,20 +40,19 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { // Helper function to validate the jpeg appears intact. LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { + // Maximum size that ValidateJpeg will consider valid. + const size_t kMaxJpegSize = 0x7fffffffull; const size_t kBackSearchSize = 1024; - if (sample_size < 64) { + if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) { // ERROR: Invalid jpeg size: sample_size return LIBYUV_FALSE; } - if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image + if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker // ERROR: Invalid jpeg initial start code return LIBYUV_FALSE; } - // Step over SOI marker. - sample += 2; - sample_size -= 2; - // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer. + // Look for the End Of Image (EOI) marker near the end of the buffer. if (sample_size > kBackSearchSize) { if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) { return LIBYUV_TRUE; // Success: Valid jpeg. @@ -90,8 +60,8 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { // Reduce search size for forward search. sample_size = sample_size - kBackSearchSize + 1; } - return ScanEOI(sample, sample_size); - + // Step over SOI marker and scan for EOI. + return ScanEOI(sample + 2, sample_size - 2); } #ifdef __cplusplus diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc index b96bd5020..237ab6831 100644 --- a/third_party/libyuv/source/planar_functions.cc +++ b/third_party/libyuv/source/planar_functions.cc @@ -17,6 +17,7 @@ #include "libyuv/mjpeg_decoder.h" #endif #include "libyuv/row.h" +#include "libyuv/scale_row.h" // for ScaleRowDown2 #ifdef __cplusplus namespace libyuv { @@ -237,14 +238,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_MIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MirrorRow = MirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSE2; - } - } -#endif #if defined(HAS_MIRRORROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MirrorRow = MirrorRow_Any_SSSE3; @@ -262,11 +255,11 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } #endif // TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_MIRRORROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { - MirrorRow = MirrorRow_MIPS_DSPR2; + MirrorRow = MirrorRow_DSPR2; } #endif @@ -287,9 +280,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, int width, int height) { int y; void (*YUY2ToUV422Row)(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) = + uint8* dst_u, uint8* dst_v, int width) = YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = YUY2ToYRow_C; // Negative height means invert the image. if (height < 0) { @@ -359,10 +352,10 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, int width, int height) { int y; void (*UYVYToUV422Row)(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) = + uint8* dst_u, uint8* dst_v, int width) = UYVYToUV422Row_C; void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int pix) = UYVYToYRow_C; + uint8* dst_y, int width) = UYVYToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -541,11 +534,6 @@ ARGBBlendRow GetARGBBlend() { return ARGBBlendRow; } #endif -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlendRow = ARGBBlendRow_SSE2; - } -#endif #if defined(HAS_ARGBBLENDROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBBlendRow = ARGBBlendRow_NEON; @@ -590,6 +578,179 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, return 0; } +// Alpha Blend plane and store to destination. +LIBYUV_API +int BlendPlane(const uint8* src_y0, int src_stride_y0, + const uint8* src_y1, int src_stride_y1, + const uint8* alpha, int alpha_stride, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Coalesce rows for Y plane. + if (src_stride_y0 == width && + src_stride_y1 == width && + alpha_stride == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; + } + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); + src_y0 += src_stride_y0; + src_y1 += src_stride_y1; + alpha += alpha_stride; + dst_y += dst_stride_y; + } + return 0; +} + +#define MAXTWIDTH 2048 +// Alpha Blend YUV images and store to destination. +LIBYUV_API +int I420Blend(const uint8* src_y0, int src_stride_y0, + const uint8* src_u0, int src_stride_u0, + const uint8* src_v0, int src_stride_v0, + const uint8* src_y1, int src_stride_y1, + const uint8* src_u1, int src_stride_u1, + const uint8* src_v1, int src_stride_v1, + const uint8* alpha, int alpha_stride, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + // Half width/height for UV. + int halfwidth = (width + 1) >> 1; + void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || + !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Blend Y plane. + BlendPlane(src_y0, src_stride_y0, + src_y1, src_stride_y1, + alpha, alpha_stride, + dst_y, dst_stride_y, + width, height); + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + if (!IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_C; + } +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_SSSE3; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + ScaleRowDown2 = ScaleRowDown2Box_AVX2; + } + } + } +#endif + + // Row buffer for intermediate alpha pixels. + align_buffer_64(halfalpha, halfwidth); + for (y = 0; y < height; y += 2) { + // last row of odd height image use 1 row of alpha instead of 2. + if (y == (height - 1)) { + alpha_stride = 0; + } + // Subsample 2 rows of UV to half width and half height. + ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth); + alpha += alpha_stride * 2; + BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth); + BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth); + src_u0 += src_stride_u0; + src_u1 += src_stride_u1; + dst_u += dst_stride_u; + src_v0 += src_stride_v0; + src_v1 += src_stride_v1; + dst_v += dst_stride_v; + } + free_aligned_buffer_64(halfalpha); + return 0; +} + // Multiply 2 ARGB images and store to destination. LIBYUV_API int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, @@ -777,77 +938,67 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, } return 0; } - -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { +// Convert I422 to RGBA with matrix +static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, int height) { int y; - void (*I422ToBGRARow)(const uint8* y_buf, + void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I422ToBGRARow_C; - if (!src_y || !src_u || !src_v || - !dst_bgra || + const struct YuvConstants* yuvconstants, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; - dst_stride_bgra = -dst_stride_bgra; - } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_bgra == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; } -#if defined(HAS_I422TOBGRAROW_SSSE3) +#if defined(HAS_I422TORGBAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_SSSE3; + I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif -#if defined(HAS_I422TOBGRAROW_AVX2) +#if defined(HAS_I422TORGBAROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I422ToBGRARow = I422ToBGRARow_Any_AVX2; + I422ToRGBARow = I422ToRGBARow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I422ToBGRARow = I422ToBGRARow_AVX2; + I422ToRGBARow = I422ToRGBARow_AVX2; } } #endif -#if defined(HAS_I422TOBGRAROW_NEON) +#if defined(HAS_I422TORGBAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToBGRARow = I422ToBGRARow_Any_NEON; + I422ToRGBARow = I422ToRGBARow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_NEON; + I422ToRGBARow = I422ToRGBARow_NEON; } } #endif -#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && +#if defined(HAS_I422TORGBAROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { - I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { + I422ToRGBARow = I422ToRGBARow_DSPR2; } #endif for (y = 0; y < height; ++y) { - I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); - dst_bgra += dst_stride_bgra; + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; @@ -855,140 +1006,34 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, return 0; } -// Convert I422 to ABGR. +// Convert I422 to RGBA. LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, +int I422ToRGBA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, + uint8* dst_rgba, int dst_stride_rgba, int width, int height) { - int y; - void (*I422ToABGRRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToABGRRow_C; - if (!src_y || !src_u || !src_v || - !dst_abgr || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; - dst_stride_abgr = -dst_stride_abgr; - } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_abgr == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; - } -#if defined(HAS_I422TOABGRROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - I422ToABGRRow = I422ToABGRRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_NEON; - } - } -#endif -#if defined(HAS_I422TOABGRROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToABGRRow = I422ToABGRRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOABGRROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToABGRRow = I422ToABGRRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToABGRRow = I422ToABGRRow_AVX2; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); - dst_abgr += dst_stride_abgr; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; + return I422ToRGBAMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_rgba, dst_stride_rgba, + &kYuvI601Constants, + width, height); } -// Convert I422 to RGBA. +// Convert I422 to BGRA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, +int I422ToBGRA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, + uint8* dst_bgra, int dst_stride_bgra, int width, int height) { - int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || - !dst_rgba || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_rgba == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; - } -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; + return I422ToRGBAMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); } // Convert NV12 to RGB565. @@ -1001,6 +1046,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, void (*NV12ToRGB565Row)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { @@ -1038,7 +1084,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width); + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -1048,59 +1094,52 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, return 0; } -// Convert NV21 to RGB565. +// Convert RAW to RGB24. LIBYUV_API -int NV21ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int RAWToRGB24(const uint8* src_raw, int src_stride_raw, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { int y; - void (*NV21ToRGB565Row)(const uint8* y_buf, - const uint8* src_vu, - uint8* rgb_buf, - int width) = NV21ToRGB565Row_C; - if (!src_y || !src_vu || !dst_rgb565 || + void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) = + RAWToRGB24Row_C; + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_raw = dst_stride_rgb24 = 0; } -#if defined(HAS_NV21TORGB565ROW_SSSE3) +#if defined(HAS_RAWTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3; + RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; + RAWToRGB24Row = RAWToRGB24Row_SSSE3; } } #endif -#if defined(HAS_NV21TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV21ToRGB565Row = NV21ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_NV21TORGB565ROW_NEON) +#if defined(HAS_RAWTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; + RAWToRGB24Row = RAWToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - NV21ToRGB565Row = NV21ToRGB565Row_NEON; + RAWToRGB24Row = RAWToRGB24Row_NEON; } } #endif for (y = 0; y < height; ++y) { - NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_vu += src_stride_vu; - } + RAWToRGB24Row(src_raw, dst_rgb24, width); + src_raw += src_stride_raw; + dst_rgb24 += dst_stride_rgb24; } return 0; } @@ -1110,7 +1149,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, uint32 value) { int y; - void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C; + void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1186,7 +1225,7 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, int width, int height, uint32 value) { int y; - void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C; + void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { @@ -1262,14 +1301,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSE2; - } - } -#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -1824,45 +1855,37 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, return 0; } -// Interpolate 2 ARGB images by specified amount (0 to 255). +// Interpolate 2 planes by specified amount (0 to 255). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation) { +int InterpolatePlane(const uint8* src0, int src_stride0, + const uint8* src1, int src_stride1, + uint8* dst, int dst_stride, + int width, int height, int interpolation) { int y; void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + if (!src0 || !src1 || !dst || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst = dst + (height - 1) * dst_stride; + dst_stride = -dst_stride; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride0 == width && + src_stride1 == width && + dst_stride == width) { width *= height; height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + src_stride0 = src_stride1 = dst_stride = 0; } -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } @@ -1870,7 +1893,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } @@ -1878,27 +1901,74 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif -#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && - IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) && - IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - InterpolateRow = InterpolateRow_MIPS_DSPR2; +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && + IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) && + IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) && + IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_DSPR2; } #endif for (y = 0; y < height; ++y) { - InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, - width * 4, interpolation); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; + InterpolateRow(dst, src0, src1 - src0, width, interpolation); + src0 += src_stride0; + src1 += src_stride1; + dst += dst_stride; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, + src_argb1, src_stride_argb1, + dst_argb, dst_stride_argb, + width * 4, height, interpolation); +} + +// Interpolate 2 YUV images by specified amount (0 to 255). +LIBYUV_API +int I420Interpolate(const uint8* src0_y, int src0_stride_y, + const uint8* src0_u, int src0_stride_u, + const uint8* src0_v, int src0_stride_v, + const uint8* src1_y, int src1_stride_y, + const uint8* src1_u, int src1_stride_u, + const uint8* src1_v, int src1_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, int interpolation) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src0_y || !src0_u || !src0_v || + !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; } + InterpolatePlane(src0_y, src0_stride_y, + src1_y, src1_stride_y, + dst_y, dst_stride_y, + width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, + src1_u, src1_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, + src1_v, src1_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight, interpolation); return 0; } @@ -1909,7 +1979,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, const uint8* shuffler, int width, int height) { int y; void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, - const uint8* shuffler, int pix) = ARGBShuffleRow_C; + const uint8* shuffler, int width) = ARGBShuffleRow_C; if (!src_bgra || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1976,7 +2046,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, const uint8* src_sobely, uint8* dst, int width)) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) = + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) = ARGBToYJRow_C; void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) = SobelYRow_C; @@ -2280,13 +2350,19 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } } #endif #if defined(HAS_ARGBCOPYALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } } #endif @@ -2298,6 +2374,49 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, return 0; } +// Extract just the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8* src_argb, int src_stride, + uint8* dst_a, int dst_stride, + int width, int height) { + if (!src_argb || !dst_a || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb += (height - 1) * src_stride; + src_stride = -src_stride; + } + // Coalesce rows. + if (src_stride == width * 4 && dst_stride == width) { + width *= height; + height = 1; + src_stride = dst_stride = 0; + } + void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) = + ARGBExtractAlphaRow_C; +#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif + + for (int y = 0; y < height; ++y) { + ARGBExtractAlphaRow(src_argb, dst_a, width); + src_argb += src_stride; + dst_a += dst_stride; + } + return 0; +} + // Copy a planar Y channel to the alpha channel of a destination ARGB image. LIBYUV_API int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, @@ -2323,13 +2442,19 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } } #endif #if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } } #endif @@ -2341,6 +2466,9 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, return 0; } +// TODO(fbarchard): Consider if width is even Y channel can be split +// directly. A SplitUVRow_Odd function could copy the remaining chroma. + LIBYUV_API int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_y, int dst_stride_y, @@ -2348,8 +2476,8 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, int width, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = - SplitUVRow_C; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) = SplitUVRow_C; void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; @@ -2388,14 +2516,6 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2423,22 +2543,24 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, { int awidth = halfwidth * 2; - // 2 rows of uv - align_buffer_64(rows, awidth * 2); + // row of y and 2 rows of uv + align_buffer_64(rows, awidth * 3); for (y = 0; y < height - 1; y += 2) { // Split Y from UV. - SplitUVRow(src_yuy2, dst_y, rows, awidth); - SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, - rows + awidth, awidth); - InterpolateRow(dst_uv, rows, awidth, awidth, 128); + SplitUVRow(src_yuy2, rows, rows + awidth, awidth); + memcpy(dst_y, rows, width); + SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); + memcpy(dst_y + dst_stride_y, rows, width); + InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); src_yuy2 += src_stride_yuy2 * 2; dst_y += dst_stride_y * 2; dst_uv += dst_stride_uv; } if (height & 1) { // Split Y from UV. - SplitUVRow(src_yuy2, dst_y, dst_uv, width); + SplitUVRow(src_yuy2, rows, dst_uv, awidth); + memcpy(dst_y, rows, width); } free_aligned_buffer_64(rows); } @@ -2452,8 +2574,8 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, int width, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = - SplitUVRow_C; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) = SplitUVRow_C; void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; @@ -2492,14 +2614,6 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2527,22 +2641,24 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, { int awidth = halfwidth * 2; - // 2 rows of uv - align_buffer_64(rows, awidth * 2); + // row of y and 2 rows of uv + align_buffer_64(rows, awidth * 3); for (y = 0; y < height - 1; y += 2) { // Split Y from UV. - SplitUVRow(src_uyvy, rows, dst_y, awidth); - SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth, - dst_y + dst_stride_y, awidth); - InterpolateRow(dst_uv, rows, awidth, awidth, 128); + SplitUVRow(src_uyvy, rows + awidth, rows, awidth); + memcpy(dst_y, rows, width); + SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth); + memcpy(dst_y + dst_stride_y, rows, width); + InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); src_uyvy += src_stride_uyvy * 2; dst_y += dst_stride_y * 2; dst_uv += dst_stride_uv; } if (height & 1) { // Split Y from UV. - SplitUVRow(src_uyvy, dst_y, dst_uv, width); + SplitUVRow(src_uyvy, dst_uv, rows, awidth); + memcpy(dst_y, rows, width); } free_aligned_buffer_64(rows); } diff --git a/third_party/libyuv/source/rotate.cc b/third_party/libyuv/source/rotate.cc index be3d58920..01ea5c407 100644 --- a/third_party/libyuv/source/rotate.cc +++ b/third_party/libyuv/source/rotate.cc @@ -49,13 +49,13 @@ void TransposePlane(const uint8* src, int src_stride, } } #endif -#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { +#if defined(HAS_TRANSPOSEWX8_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2; + TransposeWx8 = TransposeWx8_Fast_DSPR2; } else { - TransposeWx8 = TransposeWx8_MIPS_DSPR2; + TransposeWx8 = TransposeWx8_DSPR2; } } #endif @@ -117,14 +117,6 @@ void RotatePlane180(const uint8* src, int src_stride, } } #endif -#if defined(HAS_MIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MirrorRow = MirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSE2; - } - } -#endif #if defined(HAS_MIRRORROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MirrorRow = MirrorRow_Any_SSSE3; @@ -142,11 +134,11 @@ void RotatePlane180(const uint8* src, int src_stride, } #endif // TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_MIRRORROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { - MirrorRow = MirrorRow_MIPS_DSPR2; + MirrorRow = MirrorRow_DSPR2; } #endif #if defined(HAS_COPYROW_SSE2) @@ -204,14 +196,17 @@ void TransposeUV(const uint8* src, int src_stride, } #endif #if defined(HAS_TRANSPOSEUVWX8_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { - TransposeUVWx8 = TransposeUVWx8_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + TransposeUVWx8 = TransposeUVWx8_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; + } } #endif -#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && +#if defined(HAS_TRANSPOSEUVWX8_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + TransposeUVWx8 = TransposeUVWx8_DSPR2; } #endif @@ -272,22 +267,22 @@ void RotateUV180(const uint8* src, int src_stride, uint8* dst_b, int dst_stride_b, int width, int height) { int i; - void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = + void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = MirrorUVRow_C; #if defined(HAS_MIRRORUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - MirrorRowUV = MirrorUVRow_NEON; + MirrorUVRow = MirrorUVRow_NEON; } #endif -#if defined(HAS_MIRRORROW_UV_SSSE3) +#if defined(HAS_MIRRORUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorRowUV = MirrorUVRow_SSSE3; + MirrorUVRow = MirrorUVRow_SSSE3; } #endif -#if defined(HAS_MIRRORUVROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_MIRRORUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - MirrorRowUV = MirrorUVRow_MIPS_DSPR2; + MirrorUVRow = MirrorUVRow_DSPR2; } #endif @@ -295,7 +290,7 @@ void RotateUV180(const uint8* src, int src_stride, dst_b += dst_stride_b * (height - 1); for (i = 0; i < height; ++i) { - MirrorRowUV(src, dst_a, dst_b, width); + MirrorUVRow(src, dst_a, dst_b, width); src += src_stride; dst_a -= dst_stride_a; dst_b -= dst_stride_b; diff --git a/third_party/libyuv/source/rotate_any.cc b/third_party/libyuv/source/rotate_any.cc index 4d6eb34e1..31a74c315 100644 --- a/third_party/libyuv/source/rotate_any.cc +++ b/third_party/libyuv/source/rotate_any.cc @@ -18,7 +18,7 @@ namespace libyuv { extern "C" { #endif -#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \ +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ void NAMEANY(const uint8* src, int src_stride, \ uint8* dst, int dst_stride, int width) { \ int r = width & MASK; \ @@ -26,24 +26,49 @@ extern "C" { if (n > 0) { \ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ } \ - TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\ } #ifdef HAS_TRANSPOSEWX8_NEON -TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) +TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) #endif #ifdef HAS_TRANSPOSEWX8_SSSE3 -TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7) +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #endif #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 -TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15) +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif -#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2 -TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7) +#ifdef HAS_TRANSPOSEWX8_DSPR2 +TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7) #endif - #undef TANY +#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8* src, int src_stride, \ + uint8* dst_a, int dst_stride_a, \ + uint8* dst_b, int dst_stride_b, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \ + n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, \ + dst_a + n * dst_stride_a, dst_stride_a, \ + dst_b + n * dst_stride_b, dst_stride_b, r); \ + } + +#ifdef HAS_TRANSPOSEUVWX8_NEON +TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX8_SSE2 +TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX8_DSPR2 +TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) +#endif +#undef TUVANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/rotate_gcc.cc b/third_party/libyuv/source/rotate_gcc.cc index fd385bcd3..cbe870caa 100644 --- a/third_party/libyuv/source/rotate_gcc.cc +++ b/third_party/libyuv/source/rotate_gcc.cc @@ -17,16 +17,17 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) - #if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. +#if defined(HAS_TRANSPOSEWX8_SSSE3) void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 2 \n" + LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" @@ -105,386 +106,260 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } +#endif // defined(HAS_TRANSPOSEWX8_SSSE3) -#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); - asm ( - DECLARE_FUNCTION(TransposeUVWx8_SSE2) - "push %ebx \n" - "push %esi \n" - "push %edi \n" - "push %ebp \n" - "mov 0x14(%esp),%eax \n" - "mov 0x18(%esp),%edi \n" - "mov 0x1c(%esp),%edx \n" - "mov 0x20(%esp),%esi \n" - "mov 0x24(%esp),%ebx \n" - "mov 0x28(%esp),%ebp \n" - "mov %esp,%ecx \n" - "sub $0x14,%esp \n" - "and $0xfffffff0,%esp \n" - "mov %ecx,0x10(%esp) \n" - "mov 0x2c(%ecx),%ecx \n" - -"1: \n" - "movdqu (%eax),%xmm0 \n" - "movdqu (%eax,%edi,1),%xmm1 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm0,%xmm7 \n" - "punpcklbw %xmm1,%xmm0 \n" - "punpckhbw %xmm1,%xmm7 \n" - "movdqa %xmm7,%xmm1 \n" - "movdqu (%eax),%xmm2 \n" - "movdqu (%eax,%edi,1),%xmm3 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm2,%xmm7 \n" - "punpcklbw %xmm3,%xmm2 \n" - "punpckhbw %xmm3,%xmm7 \n" - "movdqa %xmm7,%xmm3 \n" - "movdqu (%eax),%xmm4 \n" - "movdqu (%eax,%edi,1),%xmm5 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm4,%xmm7 \n" - "punpcklbw %xmm5,%xmm4 \n" - "punpckhbw %xmm5,%xmm7 \n" - "movdqa %xmm7,%xmm5 \n" - "movdqu (%eax),%xmm6 \n" - "movdqu (%eax,%edi,1),%xmm7 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqu %xmm5,(%esp) \n" - "neg %edi \n" - "movdqa %xmm6,%xmm5 \n" - "punpcklbw %xmm7,%xmm6 \n" - "punpckhbw %xmm7,%xmm5 \n" - "movdqa %xmm5,%xmm7 \n" - "lea 0x10(%eax,%edi,8),%eax \n" - "neg %edi \n" - "movdqa %xmm0,%xmm5 \n" - "punpcklwd %xmm2,%xmm0 \n" - "punpckhwd %xmm2,%xmm5 \n" - "movdqa %xmm5,%xmm2 \n" - "movdqa %xmm1,%xmm5 \n" - "punpcklwd %xmm3,%xmm1 \n" - "punpckhwd %xmm3,%xmm5 \n" - "movdqa %xmm5,%xmm3 \n" - "movdqa %xmm4,%xmm5 \n" - "punpcklwd %xmm6,%xmm4 \n" - "punpckhwd %xmm6,%xmm5 \n" - "movdqa %xmm5,%xmm6 \n" - "movdqu (%esp),%xmm5 \n" - "movdqu %xmm6,(%esp) \n" - "movdqa %xmm5,%xmm6 \n" - "punpcklwd %xmm7,%xmm5 \n" - "punpckhwd %xmm7,%xmm6 \n" - "movdqa %xmm6,%xmm7 \n" - "movdqa %xmm0,%xmm6 \n" - "punpckldq %xmm4,%xmm0 \n" - "punpckhdq %xmm4,%xmm6 \n" - "movdqa %xmm6,%xmm4 \n" - "movdqu (%esp),%xmm6 \n" - "movlpd %xmm0,(%edx) \n" - "movhpd %xmm0,(%ebx) \n" - "movlpd %xmm4,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm4,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - "movdqa %xmm2,%xmm0 \n" - "punpckldq %xmm6,%xmm2 \n" - "movlpd %xmm2,(%edx) \n" - "movhpd %xmm2,(%ebx) \n" - "punpckhdq %xmm6,%xmm0 \n" - "movlpd %xmm0,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm0,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - "movdqa %xmm1,%xmm0 \n" - "punpckldq %xmm5,%xmm1 \n" - "movlpd %xmm1,(%edx) \n" - "movhpd %xmm1,(%ebx) \n" - "punpckhdq %xmm5,%xmm0 \n" - "movlpd %xmm0,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm0,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - "movdqa %xmm3,%xmm0 \n" - "punpckldq %xmm7,%xmm3 \n" - "movlpd %xmm3,(%edx) \n" - "movhpd %xmm3,(%ebx) \n" - "punpckhdq %xmm7,%xmm0 \n" - "sub $0x8,%ecx \n" - "movlpd %xmm0,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm0,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - "jg 1b \n" - "mov 0x10(%esp),%esp \n" - "pop %ebp \n" - "pop %edi \n" - "pop %esi \n" - "pop %ebx \n" -#if defined(__native_client__) - "pop %ecx \n" - "and $0xffffffe0,%ecx \n" - "jmp *%ecx \n" -#else - "ret \n" -#endif -); -#endif -#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ - defined(__x86_64__) -// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +// Transpose 16x8. 64 bit +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - ".p2align 2 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" -); + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" + ); } +#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) +// Transpose UV 8x8. 64 bit. +#if defined(HAS_TRANSPOSEUVWX8_SSE2) void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - ".p2align 2 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9" -); + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" + ); } -#endif -#endif - +#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/third_party/libyuv/source/rotate_mips.cc b/third_party/libyuv/source/rotate_mips.cc index efe6bd909..1e8ce2519 100644 --- a/third_party/libyuv/source/rotate_mips.cc +++ b/third_party/libyuv/source/rotate_mips.cc @@ -22,8 +22,8 @@ extern "C" { defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ (_MIPS_SIM == _MIPS_SIM_ABI32) -void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +void TransposeWx8_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -106,8 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, ); } -void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { __asm__ __volatile__ ( ".set noat \n" ".set push \n" @@ -308,10 +308,10 @@ void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride, ); } -void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width) { +void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" diff --git a/third_party/libyuv/source/rotate_neon.cc b/third_party/libyuv/source/rotate_neon.cc index 76043b3b3..1c22b472b 100644 --- a/third_party/libyuv/source/rotate_neon.cc +++ b/third_party/libyuv/source/rotate_neon.cc @@ -27,7 +27,7 @@ static uvec8 kVTbl4x4Transpose = void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { - const uint8* src_temp = NULL; + const uint8* src_temp; asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter @@ -35,7 +35,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "sub %5, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - ".p2align 2 \n" "1: \n" "mov %0, %1 \n" @@ -230,7 +229,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "4: \n" - : "+r"(src_temp), // %0 + : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(src_stride), // %2 "+r"(dst), // %3 @@ -248,7 +247,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { - const uint8* src_temp = NULL; + const uint8* src_temp; asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter @@ -256,7 +255,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "sub %7, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - ".p2align 2 \n" "1: \n" "mov %0, %1 \n" @@ -514,7 +512,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "4: \n" - : "+r"(src_temp), // %0 + : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(src_stride), // %2 "+r"(dst_a), // %3 diff --git a/third_party/libyuv/source/rotate_neon64.cc b/third_party/libyuv/source/rotate_neon64.cc index f52c082b3..1ab448f3a 100644 --- a/third_party/libyuv/source/rotate_neon64.cc +++ b/third_party/libyuv/source/rotate_neon64.cc @@ -26,7 +26,7 @@ static uvec8 kVTbl4x4Transpose = void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { - const uint8* src_temp = NULL; + const uint8* src_temp; int64 width64 = (int64) width; // Work around clang 3.4 warning. asm volatile ( // loops are on blocks of 8. loop will stop when @@ -235,7 +235,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "4: \n" - : "+r"(src_temp), // %0 + : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(dst), // %2 "+r"(width64) // %3 @@ -255,7 +255,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { - const uint8* src_temp = NULL; + const uint8* src_temp; int64 width64 = (int64) width; // Work around clang 3.4 warning. asm volatile ( // loops are on blocks of 8. loop will stop when @@ -520,7 +520,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "4: \n" - : "+r"(src_temp), // %0 + : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(dst_a), // %2 "+r"(dst_b), // %3 diff --git a/third_party/libyuv/source/rotate_win.cc b/third_party/libyuv/source/rotate_win.cc index 2760066df..1300fc0fe 100644 --- a/third_party/libyuv/source/rotate_win.cc +++ b/third_party/libyuv/source/rotate_win.cc @@ -16,9 +16,8 @@ namespace libyuv { extern "C" { #endif -// This module is for Visual C x86. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ - defined(_MSC_VER) && !defined(__clang__) +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) __declspec(naked) void TransposeWx8_SSSE3(const uint8* src, int src_stride, diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc index 1cb1f6b93..494164fd0 100644 --- a/third_party/libyuv/source/row_any.cc +++ b/third_party/libyuv/source/row_any.cc @@ -22,6 +22,39 @@ extern "C" { // Subsampled source needs to be increase by 1 of not even. #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) +// Any 4 planes to 1 with yuvconstants +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ + const uint8* a_buf, uint8* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422ALPHATOARGBROW_NEON +ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) +#endif +#undef ANY41C + // Any 3 planes to 1. #define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ @@ -40,83 +73,100 @@ extern "C" { memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ SS(r, DUVSHIFT) * BPP); \ } +#ifdef HAS_I422TOYUY2ROW_SSE2 +ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) +ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_NEON +ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOUYVYROW_NEON +ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_BLENDPLANEROW_AVX2 +ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) +#endif +#ifdef HAS_BLENDPLANEROW_SSSE3 +ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) +#endif +#undef ANY31 + +// Note that odd width replication includes 444 due to implementation +// on arm that subsamples 444 to 422 internally. +// Any 3 planes to 1 with yuvconstants +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ + uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422TOARGBROW_SSSE3 -ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I411TOARGBROW_SSSE3 +ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7) #endif #ifdef HAS_I444TOARGBROW_SSSE3 -ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) -ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7) -ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7) -ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7) -ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) -ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) -ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) -ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) -ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7) -ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) -ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) +ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) +ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) #endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TORGB24ROW_AVX2 -ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) -#endif -#ifdef HAS_I422TORAWROW_AVX2 -ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15) -#endif -#ifdef HAS_J422TOARGBROW_SSSE3 -ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_J422TOARGBROW_AVX2 -ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15) +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) #endif #ifdef HAS_I422TOARGBROW_AVX2 -ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I422TOBGRAROW_AVX2 -ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15) +ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I422TORGBAROW_AVX2 -ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I422TOABGRROW_AVX2 -ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15) +ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I444TOARGBROW_AVX2 -ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) +ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif #ifdef HAS_I411TOARGBROW_AVX2 -ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) +ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) #endif #ifdef HAS_I422TOARGB4444ROW_AVX2 -ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7) #endif #ifdef HAS_I422TOARGB1555ROW_AVX2 -ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) #endif #ifdef HAS_I422TORGB565ROW_AVX2 -ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) #endif #ifdef HAS_I422TOARGBROW_NEON -ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) -ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) -ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) -ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7) -ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7) -ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) -ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) -ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7) -ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) -ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) -ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOYUY2ROW_NEON -ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) -#endif -#ifdef HAS_I422TOUYVYROW_NEON -ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) -#endif -#undef ANY31 +ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) +ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) +ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) +#endif +#undef ANY31C // Any 2 planes to 1. #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ @@ -136,32 +186,6 @@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } -// Biplanar to RGB. -#ifdef HAS_NV12TOARGBROW_SSSE3 -ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TOARGBROW_AVX2 -ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) -ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) -#endif -#ifdef HAS_NV12TOARGBROW_NEON -ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) -ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_SSSE3 -ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) -ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_AVX2 -ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) -ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15) -#endif -#ifdef HAS_NV12TORGB565ROW_NEON -ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) -ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7) -#endif - // Merge functions. #ifdef HAS_MERGEUVROW_SSE2 ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) @@ -221,6 +245,55 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #endif #undef ANY21 +// Any 2 planes to 1 with yuvconstants +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ + uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +// Biplanar to RGB. +#ifdef HAS_NV12TOARGBROW_SSSE3 +ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_AVX2 +ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV12TOARGBROW_NEON +ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_SSSE3 +ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_AVX2 +ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV21TOARGBROW_NEON +ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_SSSE3 +ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_AVX2 +ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +#endif +#ifdef HAS_NV12TORGB565ROW_NEON +ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) +#endif +#undef ANY21C + // Any 1 to 1. #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ @@ -252,8 +325,10 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) #endif -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) +#if defined(HAS_ARGBTORGB565ROW_AVX2) ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif @@ -269,15 +344,16 @@ ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) #if defined(HAS_I400TOARGBROW_AVX2) ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) #endif -#if defined(HAS_YUY2TOARGBROW_SSSE3) -ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) -ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +#if defined(HAS_RGB24TOARGBROW_SSSE3) ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) #endif +#if defined(HAS_RAWTORGB24ROW_SSSE3) +ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) +#endif #if defined(HAS_RGB565TOARGBROW_AVX2) ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) #endif @@ -287,10 +363,6 @@ ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) #if defined(HAS_ARGB4444TOARGBROW_AVX2) ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) #endif -#if defined(HAS_YUY2TOARGBROW_AVX2) -ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) -ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) -#endif #if defined(HAS_ARGBTORGB24ROW_NEON) ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) @@ -299,8 +371,9 @@ ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) -ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) -ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) +#endif +#if defined(HAS_RAWTORGB24ROW_NEON) +ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) @@ -381,9 +454,6 @@ ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif -#ifdef HAS_ARGBATTENUATEROW_SSE2 -ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3) -#endif #ifdef HAS_ARGBUNATTENUATEROW_SSE2 ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) #endif @@ -396,8 +466,44 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_NEON ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_NEON +ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) +#endif #undef ANY11 +// Any 1 to 1 blended. Destination is read, modify, write. +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) +#endif +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) +#endif +#undef ANY11B + // Any 1 to 1 with parameter. #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ @@ -440,6 +546,35 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) #endif #undef ANY11P +// Any 1 to 1 with yuvconstants +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) +ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) +ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) +ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) +ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) +#endif +#undef ANY11C + // Any 1 to 1 interpolate. Takes 2 rows of source via stride. #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ @@ -464,14 +599,11 @@ ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) #ifdef HAS_INTERPOLATEROW_SSSE3 ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #endif -#ifdef HAS_INTERPOLATEROW_SSE2 -ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15) -#endif #ifdef HAS_INTERPOLATEROW_NEON ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif -#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 -ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3) +#ifdef HAS_INTERPOLATEROW_DSPR2 +ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3) #endif #undef ANY11T @@ -496,9 +628,6 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) #ifdef HAS_MIRRORROW_SSSE3 ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #endif -#ifdef HAS_MIRRORROW_SSE2 -ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15) -#endif #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #endif @@ -548,9 +677,25 @@ ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) ANY_SIMD(src_ptr, dst_u, dst_v, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + /* repeat last 4 bytes for 422 subsampler */ \ + if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + /* repeat last 4 - 12 bytes for 411 subsampler */ \ + if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \ + } \ + if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \ memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ + temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \ + } \ + if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ @@ -566,8 +711,8 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #ifdef HAS_SPLITUVROW_NEON ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #endif -#ifdef HAS_SPLITUVROW_MIPS_DSPR2 -ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15) +#ifdef HAS_SPLITUVROW_DSPR2 +ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15) #endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) @@ -576,16 +721,12 @@ ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) #endif -#ifdef HAS_ARGBTOUV422ROW_SSSE3 -ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_YUY2TOUV422ROW_SSE2 ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) #endif #ifdef HAS_YUY2TOUV422ROW_NEON ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) -ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15) ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) @@ -607,11 +748,11 @@ ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\ memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ + temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ @@ -621,6 +762,9 @@ ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVJROW_AVX2 +ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVROW_SSSE3 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc index 49875894f..32d2f686f 100644 --- a/third_party/libyuv/source/row_common.cc +++ b/third_party/libyuv/source/row_common.cc @@ -100,6 +100,20 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { } } +void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 r = src_raw[0]; + uint8 g = src_raw[1]; + uint8 b = src_raw[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + dst_rgb24 += 3; + src_raw += 3; + } +} + void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { @@ -419,28 +433,6 @@ void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ -void ARGBToUVJ422Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8 ab = (src_argb[0] + src_argb[4]) >> 1; - uint8 ag = (src_argb[1] + src_argb[5]) >> 1; - uint8 ar = (src_argb[2] + src_argb[6]) >> 1; - dst_u[0] = RGBToUJ(ar, ag, ab); - dst_v[0] = RGBToVJ(ar, ag, ab); - src_argb += 8; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; - dst_u[0] = RGBToUJ(ar, ag, ab); - dst_v[0] = RGBToVJ(ar, ag, ab); - } -} - void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { int x; for (x = 0; x < width; ++x) { @@ -644,28 +636,6 @@ void ARGBToUV444Row_C(const uint8* src_argb, } } -void ARGBToUV422Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8 ab = (src_argb[0] + src_argb[4]) >> 1; - uint8 ag = (src_argb[1] + src_argb[5]) >> 1; - uint8 ar = (src_argb[2] + src_argb[6]) >> 1; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb += 8; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } -} - void ARGBToUV411Row_C(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width) { int x; @@ -679,10 +649,11 @@ void ARGBToUV411Row_C(const uint8* src_argb, dst_u += 1; dst_v += 1; } + // Odd width handling mimics 'any' function which replicates last pixel. if ((width & 3) == 3) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3; + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2; dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); } else if ((width & 3) == 2) { @@ -994,13 +965,15 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { } } +// TODO(fbarchard): Unify these structures to be platform independent. +// TODO(fbarchard): Generate SIMD structures from float matrix. + // BT.601 YUV to RGB reference // R = (Y - 16) * 1.164 - V * -1.596 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 // B = (Y - 16) * 1.164 - U * -2.018 // Y contribution to R,G,B. Scale and bias. -// TODO(fbarchard): Consider moving constants into a common header. #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ @@ -1011,36 +984,76 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { #define VR -102 /* round(-1.596 * 64) */ // Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) - -// C reference code that mimics the YUV assembly. -static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, - uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6); - *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6); - *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6); -} - -// C reference code that mimics the YUV assembly. -static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32)(y1 + YGB) >> 6); - *g = Clamp((int32)(y1 + YGB) >> 6); - *r = Clamp((int32)(y1 + YGB) >> 6); -} +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, + { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, + { UG, VG, UG, VG, UG, VG, UG, VG }, + { UG, VG, UG, VG, UG, VG, UG, VG }, + { BB, BG, BR, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, + { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, + { VG, UG, VG, UG, VG, UG, VG, UG }, + { VG, UG, VG, UG, VG, UG, VG, UG }, + { BR, BG, BB, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +#elif defined(__arm__) +const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, + { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, + { BB, BG, BR, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, + { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, + { BR, BG, BB, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +#else +const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; +const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; +#endif -#undef YG +#undef BB +#undef BG +#undef BR #undef YGB #undef UB #undef UG #undef VG #undef VR -#undef BB -#undef BG -#undef BR +#undef YG // JPEG YUV to RGB reference // * R = Y - V * -1.40200 @@ -1048,39 +1061,228 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { // * B = Y - U * -1.77200 // Y contribution to R,G,B. Scale and bias. -// TODO(fbarchard): Consider moving constants into a common header. -#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGBJ 32 /* 64 / 2 */ +#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGB 32 /* 64 / 2 */ // U and V contributions to R,G,B. -#define UBJ -113 /* round(-1.77200 * 64) */ -#define UGJ 22 /* round(0.34414 * 64) */ -#define VGJ 46 /* round(0.71414 * 64) */ -#define VRJ -90 /* round(-1.40200 * 64) */ +#define UB -113 /* round(-1.77200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR -90 /* round(-1.40200 * 64) */ -// Bias values to subtract 16 from Y and 128 from U and V. -#define BBJ (UBJ * 128 + YGBJ) -#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) -#define BRJ (VRJ * 128 + YGBJ) +// Bias values to round, and subtract 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, + { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, + { UG, VG, UG, VG, UG, VG, UG, VG }, + { UG, VG, UG, VG, UG, VG, UG, VG }, + { BB, BG, BR, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, + { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, + { VG, UG, VG, UG, VG, UG, VG, UG }, + { VG, UG, VG, UG, VG, UG, VG, UG }, + { BR, BG, BB, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +#elif defined(__arm__) +const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, + { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, + { BB, BG, BR, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, + { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, + { BR, BG, BB, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +#else +const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; +const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// BT.709 YUV to RGB reference +// * R = Y - V * -1.28033 +// * G = Y - U * 0.21482 - V * 0.38059 +// * B = Y - U * -2.12798 + +// Y contribution to R,G,B. Scale and bias. +#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGB 32 /* 64 / 2 */ + +// TODO(fbarchard): Find way to express 2.12 instead of 2.0. +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.12798 * 64)) */ +#define UG 14 /* round(0.21482 * 64) */ +#define VG 24 /* round(0.38059 * 64) */ +#define VR -82 /* round(-1.28033 * 64) */ + +// Bias values to round, and subtract 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, + { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, + { UG, VG, UG, VG, UG, VG, UG, VG }, + { UG, VG, UG, VG, UG, VG, UG, VG }, + { BB, BG, BR, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, + { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, + { VG, UG, VG, UG, VG, UG, VG, UG }, + { VG, UG, VG, UG, VG, UG, VG, UG }, + { BR, BG, BB, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +#elif defined(__arm__) +const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, + { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, + { BB, BG, BR, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, + { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, + { BR, BG, BB, 0, 0, 0, 0, 0 }, + { 0x0101 * YG, 0, 0, 0 } +}; +#else +const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; +const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG // C reference code that mimics the YUV assembly. -static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v, - uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16; - *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6); - *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6); - *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6); -} - -#undef YGJ -#undef YGBJ -#undef UBJ -#undef UGJ -#undef VGJ -#undef VRJ -#undef BBJ -#undef BGJ -#undef BRJ +static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32)(-(u * ub ) + y1 + bb) >> 6); + *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6); + *r = Clamp((int32)(-( v * vr) + y1 + br) >> 6); +} + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// C reference code that mimics the YUV assembly. +static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32)(y1 + YGB) >> 6); + *g = Clamp((int32)(y1 + YGB) >> 6); + *r = Clamp((int32)(y1 + YGB) >> 6); +} + +#undef YG +#undef YGB #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) @@ -1090,14 +1292,17 @@ void I444ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { uint8 u = (src_u[0] + src_u[1] + 1) >> 1; uint8 v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, + yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, + yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 2; @@ -1106,7 +1311,8 @@ void I444ToARGBRow_C(const uint8* src_y, } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; } } #else @@ -1114,11 +1320,12 @@ void I444ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1133,14 +1340,15 @@ void I422ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1149,33 +1357,36 @@ void I422ToARGBRow_C(const uint8* src_y, } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void J422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - int width) { +void I422AlphaToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvJPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); - rgb_buf[3] = 255; - YuvJPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); - rgb_buf[7] = 255; + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; src_v += 1; + src_a += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvJPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); - rgb_buf[3] = 255; + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; } } @@ -1183,35 +1394,14 @@ void I422ToRGB24Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 3, rgb_buf + 4, rgb_buf + 5); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 6; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); - } -} - -void I422ToRAWRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 5, rgb_buf + 4, rgb_buf + 3); + rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; @@ -1219,7 +1409,7 @@ void I422ToRAWRow_C(const uint8* src_y, } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); } } @@ -1227,6 +1417,7 @@ void I422ToARGB4444Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width) { uint8 b0; uint8 g0; @@ -1236,8 +1427,8 @@ void I422ToARGB4444Row_C(const uint8* src_y, uint8 r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; @@ -1252,7 +1443,7 @@ void I422ToARGB4444Row_C(const uint8* src_y, dst_argb4444 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; @@ -1265,6 +1456,7 @@ void I422ToARGB1555Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, int width) { uint8 b0; uint8 g0; @@ -1274,8 +1466,8 @@ void I422ToARGB1555Row_C(const uint8* src_y, uint8 r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; @@ -1290,7 +1482,7 @@ void I422ToARGB1555Row_C(const uint8* src_y, dst_argb1555 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; @@ -1303,6 +1495,7 @@ void I422ToRGB565Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { uint8 b0; uint8 g0; @@ -1312,8 +1505,8 @@ void I422ToRGB565Row_C(const uint8* src_y, uint8 r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -1328,7 +1521,7 @@ void I422ToRGB565Row_C(const uint8* src_y, dst_rgb565 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -1340,20 +1533,21 @@ void I411ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 3; x += 4) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; YuvPixel(src_y[2], src_u[0], src_v[0], - rgb_buf + 8, rgb_buf + 9, rgb_buf + 10); + rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants); rgb_buf[11] = 255; YuvPixel(src_y[3], src_u[0], src_v[0], - rgb_buf + 12, rgb_buf + 13, rgb_buf + 14); + rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants); rgb_buf[15] = 255; src_y += 4; src_u += 1; @@ -1362,17 +1556,17 @@ void I411ToARGBRow_C(const uint8* src_y, } if (width & 2) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1380,14 +1574,15 @@ void I411ToARGBRow_C(const uint8* src_y, void NV12ToARGBRow_C(const uint8* src_y, const uint8* src_uv, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_y[1], src_uv[0], src_uv[1], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_uv += 2; @@ -1395,7 +1590,7 @@ void NV12ToARGBRow_C(const uint8* src_y, } if (width & 1) { YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1403,24 +1598,23 @@ void NV12ToARGBRow_C(const uint8* src_y, void NV21ToARGBRow_C(const uint8* src_y, const uint8* src_vu, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; - src_y += 2; src_vu += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1428,6 +1622,7 @@ void NV21ToARGBRow_C(const uint8* src_y, void NV12ToRGB565Row_C(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { uint8 b0; uint8 g0; @@ -1437,8 +1632,8 @@ void NV12ToRGB565Row_C(const uint8* src_y, uint8 r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); - YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1); + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -1452,42 +1647,7 @@ void NV12ToRGB565Row_C(const uint8* src_y, dst_rgb565 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); - } -} - -void NV21ToRGB565Row_C(const uint8* src_y, - const uint8* vsrc_u, - uint8* dst_rgb565, - int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); - YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - b1 = b1 >> 3; - g1 = g1 >> 2; - r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); - src_y += 2; - vsrc_u += 2; - dst_rgb565 += 4; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -1497,92 +1657,44 @@ void NV21ToRGB565Row_C(const uint8* src_y, void YUY2ToARGBRow_C(const uint8* src_yuy2, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } void UYVYToARGBRow_C(const uint8* src_uyvy, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); - rgb_buf[3] = 255; - } -} - -void I422ToBGRARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); - rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 7, rgb_buf + 6, rgb_buf + 5); - rgb_buf[4] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); - rgb_buf[0] = 255; - } -} - -void I422ToABGRRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 6, rgb_buf + 5, rgb_buf + 4); - rgb_buf[7] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1591,14 +1703,15 @@ void I422ToRGBARow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 5, rgb_buf + 6, rgb_buf + 7); + rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; @@ -1607,7 +1720,7 @@ void I422ToRGBARow_C(const uint8* src_y, } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } @@ -1859,6 +1972,25 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } } #undef BLEND + +#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + dst[1] = UBLEND(src0[1], src1[1], alpha[1]); + src0 += 2; + src1 += 2; + alpha += 2; + dst += 2; + } + if (width & 1) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + } +} +#undef UBLEND + #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 // Multiply source RGB by alpha and store to destination. @@ -2015,18 +2147,18 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } // Blend 2 rows into 1. -static void HalfRow_C(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { +static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride, + uint8* dst_uv, int width) { int x; - for (x = 0; x < pix; ++x) { + for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } -static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, - uint16* dst_uv, int pix) { +static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, + uint16* dst_uv, int width) { int x; - for (x = 0; x < pix; ++x) { + for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } @@ -2035,27 +2167,30 @@ static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - int y1_fraction = source_y_fraction; + int y1_fraction = source_y_fraction ; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; int x; - if (source_y_fraction == 0) { + if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); return; } - if (source_y_fraction == 128) { - HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width); + if (y1_fraction == 128) { + HalfRow_C(src_ptr, src_stride, dst_ptr, width); return; } for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[1] = + (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8; src_ptr += 2; src_ptr1 += 2; dst_ptr += 2; } if (width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; } } @@ -2071,7 +2206,7 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, return; } if (source_y_fraction == 128) { - HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width); + HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); return; } for (x = 0; x < width - 1; x += 2) { @@ -2088,14 +2223,14 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, // Use first 4 shuffler values to reorder ARGB channels. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { int index0 = shuffler[0]; int index1 = shuffler[1]; int index2 = shuffler[2]; int index3 = shuffler[3]; // Shuffle a row of ARGB. int x; - for (x = 0; x < pix; ++x) { + for (x = 0; x < width; ++x) { // To support in-place conversion. uint8 b = src_argb[index0]; uint8 g = src_argb[index1]; @@ -2156,21 +2291,138 @@ void I422ToUYVYRow_C(const uint8* src_y, } } + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, + const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp((int32)(db)); + dst_argb[1] = Clamp((int32)(dg)); + dst_argb[2] = Clamp((int32)(dr)); + dst_argb[3] = Clamp((int32)(da)); + src_argb += 4; + dst_argb += 4; + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + const uint8* luma1; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + luma1 = ((src_argb[4] * bc + src_argb[5] * gc + + src_argb[6] * rc) & 0x7F00u) + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst_a[0] = src_argb[3]; + dst_a[1] = src_argb[7]; + dst_a += 2; + src_argb += 8; + } + if (width & 1) { + dst_a[0] = src_argb[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} + // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 -#if !(defined(_MSC_VER) && !defined(__clang__)) && \ +#if !(defined(_MSC_VER) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_u += twidth / 2; @@ -2186,12 +2438,13 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); src_y += twidth; src_u += twidth / 2; @@ -2207,12 +2460,13 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); src_y += twidth; src_u += twidth / 2; @@ -2224,13 +2478,16 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_SSSE3) -void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, - uint8* dst_rgb565, int width) { +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth); + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_uv += twidth; @@ -2240,70 +2497,22 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, } #endif -#if defined(HAS_NV21TORGB565ROW_SSSE3) -void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, - uint8* dst_rgb565, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); - src_y += twidth; - src_vu += twidth; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_YUY2TOARGBROW_SSSE3) -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { - // Row buffers for intermediate YUV pixels. - SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth); - YUY2ToYRow_SSE2(src_yuy2, row_y, twidth); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); - src_yuy2 += twidth * 2; - dst_argb += twidth * 4; - width -= twidth; - } -} -#endif - -#if defined(HAS_UYVYTOARGBROW_SSSE3) -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { - // Row buffers for intermediate YUV pixels. - SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth); - UYVYToYRow_SSE2(src_uyvy, row_y, twidth); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); - src_uyvy += twidth * 2; - dst_argb += twidth * 4; - width -= twidth; - } -} -#endif // !defined(LIBYUV_DISABLE_X86) - #if defined(HAS_I422TORGB565ROW_AVX2) void I422ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); +#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -2318,13 +2527,18 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); +#else + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); +#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -2339,13 +2553,18 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); +#else + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); +#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -2360,12 +2579,13 @@ void I422ToRGB24Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); // TODO(fbarchard): ARGBToRGB24Row_AVX2 ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); src_y += twidth; @@ -2377,199 +2597,30 @@ void I422ToRGB24Row_AVX2(const uint8* src_y, } #endif -#if defined(HAS_I422TORAWROW_AVX2) -void I422ToRAWRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_raw, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); - // TODO(fbarchard): ARGBToRAWRow_AVX2 - ARGBToRAWRow_SSSE3(row, dst_raw, twidth); - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_raw += twidth * 3; - width -= twidth; - } -} -#endif - #if defined(HAS_NV12TORGB565ROW_AVX2) -void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, - uint8* dst_rgb565, int width) { +void NV12ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth); + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); - src_y += twidth; - src_uv += twidth; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); #endif - -#if defined(HAS_NV21TORGB565ROW_AVX2) -void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu, - uint8* dst_rgb565, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth); - ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); src_y += twidth; - src_vu += twidth; + src_uv += twidth; dst_rgb565 += twidth * 2; width -= twidth; } } #endif -#if defined(HAS_YUY2TOARGBROW_AVX2) -void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) { - // Row buffers for intermediate YUV pixels. - SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth); - YUY2ToYRow_AVX2(src_yuy2, row_y, twidth); - I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); - src_yuy2 += twidth * 2; - dst_argb += twidth * 4; - width -= twidth; - } -} -#endif - -#if defined(HAS_UYVYTOARGBROW_AVX2) -void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) { - // Row buffers for intermediate YUV pixels. - SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth); - UYVYToYRow_AVX2(src_uyvy, row_y, twidth); - I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); - src_uyvy += twidth * 2; - dst_argb += twidth * 4; - width -= twidth; - } -} -#endif // !defined(LIBYUV_DISABLE_X86) - -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { - int i; - for (i = 0; i < width; ++i) { - float b = (float)(src_argb[0]); - float g = (float)(src_argb[1]); - float r = (float)(src_argb[2]); - float a = (float)(src_argb[3]); - float b2 = b * b; - float g2 = g * g; - float r2 = r * r; - float a2 = a * a; - float db = poly[0] + poly[4] * b; - float dg = poly[1] + poly[5] * g; - float dr = poly[2] + poly[6] * r; - float da = poly[3] + poly[7] * a; - float b3 = b2 * b; - float g3 = g2 * g; - float r3 = r2 * r; - float a3 = a2 * a; - db += poly[8] * b2; - dg += poly[9] * g2; - dr += poly[10] * r2; - da += poly[11] * a2; - db += poly[12] * b3; - dg += poly[13] * g3; - dr += poly[14] * r3; - da += poly[15] * a3; - - dst_argb[0] = Clamp((int32)(db)); - dst_argb[1] = Clamp((int32)(dg)); - dst_argb[2] = Clamp((int32)(dr)); - dst_argb[3] = Clamp((int32)(da)); - src_argb += 4; - dst_argb += 4; - } -} - -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { - uint32 bc = lumacoeff & 0xff; - uint32 gc = (lumacoeff >> 8) & 0xff; - uint32 rc = (lumacoeff >> 16) & 0xff; - - int i; - for (i = 0; i < width - 1; i += 2) { - // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; - const uint8* luma1; - dst_argb[0] = luma0[src_argb[0]]; - dst_argb[1] = luma0[src_argb[1]]; - dst_argb[2] = luma0[src_argb[2]]; - dst_argb[3] = src_argb[3]; - luma1 = ((src_argb[4] * bc + src_argb[5] * gc + - src_argb[6] * rc) & 0x7F00u) + luma; - dst_argb[4] = luma1[src_argb[4]]; - dst_argb[5] = luma1[src_argb[5]]; - dst_argb[6] = luma1[src_argb[6]]; - dst_argb[7] = src_argb[7]; - src_argb += 8; - dst_argb += 8; - } - if (width & 1) { - // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; - dst_argb[0] = luma0[src_argb[0]]; - dst_argb[1] = luma0[src_argb[1]]; - dst_argb[2] = luma0[src_argb[2]]; - dst_argb[3] = src_argb[3]; - } -} - -void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst[3] = src[3]; - dst[7] = src[7]; - dst += 8; - src += 8; - } - if (width & 1) { - dst[3] = src[3]; - } -} - -void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst[3] = src[0]; - dst[7] = src[1]; - dst += 8; - src += 2; - } - if (width & 1) { - dst[3] = src[0]; - } -} - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/row_gcc.cc b/third_party/libyuv/source/row_gcc.cc index 820de0a1c..1ac7ef1aa 100644 --- a/third_party/libyuv/source/row_gcc.cc +++ b/third_party/libyuv/source/row_gcc.cc @@ -17,7 +17,8 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -120,6 +121,24 @@ static uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting RAW to RGB24. Middle 8. +static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + // Shuffle table for converting ARGB to RGB24. static uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u @@ -135,109 +154,39 @@ static uvec8 kShuffleMaskARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u }; -// Shuffle table for converting ARGB to RAW. -static uvec8 kShuffleMaskARGBToRAW_0 = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 }; -#endif // HAS_RGB24TOARGBROW_SSSE3 -#if defined(TESTING) && defined(__x86_64__) -void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - asm volatile ( - ".p2align 5 \n" - "mov %%eax,%%eax \n" - "mov %%ebx,%%ebx \n" - "mov %%ecx,%%ecx \n" - "mov %%edx,%%edx \n" - "mov %%esi,%%esi \n" - "mov %%edi,%%edi \n" - "mov %%ebp,%%ebp \n" - "mov %%esp,%%esp \n" - ".p2align 5 \n" - "mov %%r8d,%%r8d \n" - "mov %%r9d,%%r9d \n" - "mov %%r10d,%%r10d \n" - "mov %%r11d,%%r11d \n" - "mov %%r12d,%%r12d \n" - "mov %%r13d,%%r13d \n" - "mov %%r14d,%%r14d \n" - "mov %%r15d,%%r15d \n" - ".p2align 5 \n" - "lea (%%rax),%%eax \n" - "lea (%%rbx),%%ebx \n" - "lea (%%rcx),%%ecx \n" - "lea (%%rdx),%%edx \n" - "lea (%%rsi),%%esi \n" - "lea (%%rdi),%%edi \n" - "lea (%%rbp),%%ebp \n" - "lea (%%rsp),%%esp \n" - ".p2align 5 \n" - "lea (%%r8),%%r8d \n" - "lea (%%r9),%%r9d \n" - "lea (%%r10),%%r10d \n" - "lea (%%r11),%%r11d \n" - "lea (%%r12),%%r12d \n" - "lea (%%r13),%%r13d \n" - "lea (%%r14),%%r14d \n" - "lea (%%r15),%%r15d \n" - - ".p2align 5 \n" - "lea 0x10(%%rax),%%eax \n" - "lea 0x10(%%rbx),%%ebx \n" - "lea 0x10(%%rcx),%%ecx \n" - "lea 0x10(%%rdx),%%edx \n" - "lea 0x10(%%rsi),%%esi \n" - "lea 0x10(%%rdi),%%edi \n" - "lea 0x10(%%rbp),%%ebp \n" - "lea 0x10(%%rsp),%%esp \n" - ".p2align 5 \n" - "lea 0x10(%%r8),%%r8d \n" - "lea 0x10(%%r9),%%r9d \n" - "lea 0x10(%%r10),%%r10d \n" - "lea 0x10(%%r11),%%r11d \n" - "lea 0x10(%%r12),%%r12d \n" - "lea 0x10(%%r13),%%r13d \n" - "lea 0x10(%%r14),%%r14d \n" - "lea 0x10(%%r15),%%r15d \n" - - ".p2align 5 \n" - "add 0x10,%%eax \n" - "add 0x10,%%ebx \n" - "add 0x10,%%ecx \n" - "add 0x10,%%edx \n" - "add 0x10,%%esi \n" - "add 0x10,%%edi \n" - "add 0x10,%%ebp \n" - "add 0x10,%%esp \n" - ".p2align 5 \n" - "add 0x10,%%r8d \n" - "add 0x10,%%r9d \n" - "add 0x10,%%r10d \n" - "add 0x10,%%r11d \n" - "add 0x10,%%r12d \n" - "add 0x10,%%r13d \n" - "add 0x10,%%r14d \n" - "add 0x10,%%r15d \n" - - ".p2align 2 \n" - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5" - ); -} -#endif // TESTING +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = { + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 +}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = { + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 +}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = { + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 +}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -258,14 +207,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 :: "memory", "cc", "xmm0", "xmm1", "xmm5" ); } #endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" @@ -297,13 +246,13 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kShuffleMaskRGB24ToARGB) // %3 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" @@ -335,13 +284,43 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kShuffleMaskRAWToARGB) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { +void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { + asm volatile ( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n" + "lea " MEMLEA(0x18,0) ",%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { asm volatile ( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" @@ -382,14 +361,14 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "memory", "cc", "eax", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } -void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { +void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { asm volatile ( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" @@ -433,14 +412,14 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "memory", "cc", "eax", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } -void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { +void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { asm volatile ( "mov $0xf0f0f0f,%%eax \n" "movd %%eax,%%xmm4 \n" @@ -471,14 +450,14 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "memory", "cc", "eax", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { +void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) { asm volatile ( "movdqa %3,%%xmm6 \n" LABELALIGN @@ -510,13 +489,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kShuffleMaskARGBToRGB24) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } -void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { +void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) { asm volatile ( "movdqa %3,%%xmm6 \n" LABELALIGN @@ -548,13 +527,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kShuffleMaskARGBToRAW) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } -void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { +void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { asm volatile ( "pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" @@ -585,12 +564,104 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { +void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, + const uint32 dither4, int width) { + asm volatile ( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, + const uint32 dither4, int width) { + asm volatile ( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + + +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" @@ -625,13 +696,13 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } -void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { +void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" @@ -654,7 +725,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" ); } @@ -662,7 +733,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -689,7 +760,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kARGBToY), // %3 "m"(kAddY16) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -700,7 +771,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { #ifdef HAS_ARGBTOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -728,7 +799,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kARGBToYJ), // %3 "m"(kAddYJ64) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -743,7 +814,7 @@ static const lvec32 kPermdARGBToY_AVX = { }; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" @@ -773,7 +844,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kARGBToY), // %3 "m"(kAddY16), // %4 "m"(kPermdARGBToY_AVX) // %5 @@ -784,7 +855,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" @@ -815,7 +886,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kARGBToYJ), // %3 "m"(kAddYJ64), // %4 "m"(kPermdARGBToY_AVX) // %5 @@ -952,6 +1023,67 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVROW_AVX2 +#ifdef HAS_ARGBTOUVJROW_AVX2 +void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) + VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) + "lea " MEMLEA(0x80,0) ",%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVJROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { @@ -1073,60 +1205,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, } #endif // HAS_ARGBTOUV444ROW_SSSE3 -#ifdef HAS_ARGBTOUV422ROW_SSSE3 -void ARGBToUV422Row_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); -} -#endif // HAS_ARGBTOUV422ROW_SSSE3 - -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -1153,7 +1232,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kBGRAToY), // %3 "m"(kAddY16) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1221,7 +1300,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, ); } -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -1248,14 +1327,14 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kABGRToY), // %3 "m"(kAddY16) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -1282,7 +1361,7 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "m"(kRGBAToY), // %3 "m"(kAddY16) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1413,132 +1492,15 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) -struct YuvConstants { - lvec8 kUVToB; // 0 - lvec8 kUVToG; // 32 - lvec8 kUVToR; // 64 - lvec16 kUVBiasB; // 96 - lvec16 kUVBiasG; // 128 - lvec16 kUVBiasR; // 160 - lvec16 kYToRgb; // 192 -}; - -// BT.601 YUV to RGB reference -// R = (Y - 16) * 1.164 - V * -1.596 -// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 -// B = (Y - 16) * 1.164 - U * -2.018 - -// Y contribution to R,G,B. Scale and bias. -// TODO(fbarchard): Consider moving constants into a common header. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -// U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.018 * 64)) */ -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ -#define VR -102 /* round(-1.596 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 + YGB) -#define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) - -// BT601 constants for YUV to RGB. -static YuvConstants SIMD_ALIGNED(kYuvConstants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; - -// BT601 constants for NV21 where chroma plane is VU instead of UV. -static YuvConstants SIMD_ALIGNED(kYvuConstants) = { - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; - -#undef YG -#undef YGB -#undef UB -#undef UG -#undef VG -#undef VR -#undef BB -#undef BG -#undef BR - -// JPEG YUV to RGB reference -// * R = Y - V * -1.40200 -// * G = Y - U * 0.34414 - V * 0.71414 -// * B = Y - U * -1.77200 - -// Y contribution to R,G,B. Scale and bias. -// TODO(fbarchard): Consider moving constants into a common header. -#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGBJ 32 /* 64 / 2 */ - -// U and V contributions to R,G,B. -#define UBJ -113 /* round(-1.77200 * 64) */ -#define UGJ 22 /* round(0.34414 * 64) */ -#define VGJ 46 /* round(0.71414 * 64) */ -#define VRJ -90 /* round(-1.40200 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. -#define BBJ (UBJ * 128 + YGBJ) -#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) -#define BRJ (VRJ * 128 + YGBJ) - -// JPEG constants for YUV to RGB. -YuvConstants SIMD_ALIGNED(kYuvJConstants) = { - { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, - UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, - { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, - UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, - UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, - UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, - { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, - 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, - { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, - BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, - { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, - BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, - { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, - BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, - { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, - YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } -}; - -#undef YGJ -#undef YGBJ -#undef UBJ -#undef UGJ -#undef VGJ -#undef VRJ -#undef BBJ -#undef BGJ -#undef BRJ - -// Read 8 UV from 411 +// Read 8 UV from 444 #define READYUV444 \ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ @@ -1546,52 +1508,144 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = { MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" -// Read 2 UV from 411, upsample to 8 UV -#define READYUV411 \ +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ + "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" + +// Read 2 UV from 411, upsample to 8 UV. +// reading 4 bytes is an msan violation. +// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" +// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) +// pinsrw fails with drmemory +// __asm pinsrw xmm0, [esi], 0 /* U */ +// __asm pinsrw xmm1, [esi + edi], 0 /* V */ +#define READYUV411_TEMP \ + "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ + "movd %[temp],%%xmm0 \n" \ + MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ + "movd %[temp],%%xmm1 \n" \ "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ - "punpckldq %%xmm0,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm0 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 UV from NV12, upsample to 8 UV #define READNV12 \ "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" + +// Read 4 VU from NV21, upsample to 8 UV +#define READNV21 \ + "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ + "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" + +// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. +#define READYUY2 \ + "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" + +// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. +#define READUYVY \ + "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" + +#if defined(__x86_64__) +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ + "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ + "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ + "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ + "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ + "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ + "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa %%xmm11,%%xmm0 \n" \ + "pmaddubsw %%xmm8,%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm12,%%xmm1 \n" \ + "pmaddubsw %%xmm9,%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa %%xmm13,%%xmm2 \n" \ + "pmaddubsw %%xmm10,%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw %%xmm14,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB_REGS \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", +#else +#define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(YuvConstants) \ +#define YUVTORGB(yuvconstants) \ "movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm2 \n" \ "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \ - "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \ + "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ "psubw %%xmm1,%%xmm0 \n" \ - "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \ - "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \ + "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ "psubw %%xmm2,%%xmm1 \n" \ - "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \ - "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \ + "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ "psubw %%xmm3,%%xmm2 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ - "punpcklbw %%xmm3,%%xmm3 \n" \ - "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ + "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" \ "psraw $0x6,%%xmm0 \n" \ "psraw $0x6,%%xmm1 \n" \ "psraw $0x6,%%xmm2 \n" \ "packuswb %%xmm0,%%xmm0 \n" \ "packuswb %%xmm1,%%xmm1 \n" \ "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB_REGS +#endif -// Store 8 ARGB values. Assumes XMM5 is zero. +// Store 8 ARGB values. #define STOREARGB \ "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklbw %%xmm5,%%xmm2 \n" \ @@ -1602,30 +1656,7 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = { "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" -// Store 8 BGRA values. Assumes XMM5 is zero. -#define STOREBGRA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm0,%%xmm1 \n" \ - "punpcklbw %%xmm2,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ - "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ - "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" - -// Store 8 ABGR values. Assumes XMM5 is zero. -#define STOREABGR \ - "punpcklbw %%xmm1,%%xmm2 \n" \ - "punpcklbw %%xmm5,%%xmm0 \n" \ - "movdqa %%xmm2,%%xmm1 \n" \ - "punpcklwd %%xmm0,%%xmm2 \n" \ - "punpckhwd %%xmm0,%%xmm1 \n" \ - "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ - "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ - "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" - -// Store 8 RGBA values. Assumes XMM5 is zero. +// Store 8 RGBA values. #define STORERGBA \ "pcmpeqb %%xmm5,%%xmm5 \n" \ "punpcklbw %%xmm2,%%xmm1 \n" \ @@ -1641,14 +1672,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READYUV444 - YUVTORGB(kYuvConstants) + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -1657,26 +1690,27 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -// TODO(fbarchard): Consider putting masks into constants. void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP(yuvconstants) "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB(kYuvConstants) + YUVTORGB(yuvconstants) "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm2,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1694,61 +1728,16 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -// TODO(fbarchard): Make width a register for 32 bit. #if defined(__i386__) && defined(__pic__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] #endif - : [kYuvConstants]"r"(&kYuvConstants.kUVToB), + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" - ); -} - -void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_raw, - int width) { - asm volatile ( - "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(kYuvConstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0," MEMACCESS([dst_raw]) " \n" - "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" - "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_raw]"+r"(dst_raw), // %[dst_raw] -// TODO(fbarchard): Make width a register for 32 bit. -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [kYuvConstants]"r"(&kYuvConstants.kUVToB), - [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), - [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" + : "memory", "cc", NACL_R14 YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } @@ -1756,14 +1745,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB(kYuvConstants) + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -1772,74 +1763,95 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( + YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - READYUV422 - YUVTORGB(kYuvConstants) + READYUVA422 + YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" + "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } +#endif // HAS_I422ALPHATOARGBROW_SSSE3 +#ifdef HAS_I411TOARGBROW_SSSE3 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { + int temp; asm volatile ( + YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - READYUV411 - YUVTORGB(kYuvConstants) + READYUV411_TEMP + YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" + "subl $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + [temp]"=&r"(temp), // %[temp] +#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } +#endif void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV12 - YUVTORGB(kYuvConstants) + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -1847,84 +1859,85 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - // Does not use r14. - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS // Does not use r14. + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, + const uint8* vu_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - READNV12 - YUVTORGB(kYuvConstants) + READNV21 + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] + [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] - // Does not use r14. - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21]"m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS // Does not use r14. + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_bgra, +void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - "sub %[u_buf],%[v_buf] \n" + YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - READYUV422 - YUVTORGB(kYuvConstants) - STOREBGRA + READYUY2 + YUVTORGB(yuvconstants) + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS // Does not use r14. + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_abgr, +void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - "sub %[u_buf],%[v_buf] \n" + YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - READYUV422 - YUVTORGB(kYuvConstants) - STOREABGR + READUYVY + YUVTORGB(yuvconstants) + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS // Does not use r14. + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1932,14 +1945,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB(kYuvConstants) + YUVTORGB(yuvconstants) STORERGBA "sub $0x8,%[width] \n" "jg 1b \n" @@ -1948,118 +1963,224 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TOARGBROW_SSSE3 +// Read 16 UV from 444 +#define READYUV444_AVX2 \ + "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" + // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - -// Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) \ - "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ - "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ - "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 \ + "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ + "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" + +// Read 4 UV from 411, upsample to 16 UV. +#define READYUV411_AVX2 \ + "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" + +// Read 8 UV from NV12, upsample to 16 UV. +#define READNV12_AVX2 \ + "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ + "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" + +// Read 8 VU from NV21, upsample to 16 UV. +#define READNV21_AVX2 \ + "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ + "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" + +// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. +#define READYUY2_AVX2 \ + "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" + +// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. +#define READUYVY_AVX2 \ + "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" + +#if defined(__x86_64__) +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ + "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ + "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ + "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ + "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ + "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ + "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" +#define YUVTORGB_AVX2(yuvconstants) \ + "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ + "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ + "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ + "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ + "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ + "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ + "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB_REGS_AVX2 \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", +#else // Convert 16 pixels: 16 UV and 16 Y. +#define YUVTORGB_SETUP_AVX2(yuvconstants) +#define YUVTORGB_AVX2(yuvconstants) \ + "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ + "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ + "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ + "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ + "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ - "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" - -#if defined(HAS_I422TOBGRAROW_AVX2) + "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB_REGS_AVX2 +#endif + +// Store 16 ARGB values. +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ + "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ + "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" + +#ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). -void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* dst_bgra, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) - - // Step 3: Weave into BGRA - "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels - "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels - - "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" - "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" + READYUV444_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] - [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#endif // HAS_I422TOBGRAROW_AVX2 +#endif // HAS_I444TOARGBROW_AVX2 -#if defined(HAS_I422TOARGBROW_AVX2) +#ifdef HAS_I411TOARGBROW_AVX2 // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, +// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) - - // Step 3: Weave into ARGB - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels - - "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + READYUV411_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" @@ -2068,40 +2189,31 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#endif // HAS_I422TOARGBROW_AVX2 +#endif // HAS_I411TOARGBROW_AVX2 -#if defined(HAS_J422TOARGBROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, +void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) - - // Step 3: Weave into ARGB - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels - - "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" @@ -2110,53 +2222,50 @@ void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#endif // HAS_J422TOARGBROW_AVX2 +#endif // HAS_I422TOARGBROW_AVX2 -#if defined(HAS_I422TOABGRROW_AVX2) +#if defined(HAS_I422ALPHATOARGBROW_AVX2) // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). -void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, + const uint8* a_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) - - // Step 3: Weave into ABGR - "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels - "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels - "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" - "sub $0x10,%[width] \n" + READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#endif // HAS_I422TOABGRROW_AVX2 +#endif // HAS_I422ALPHATOARGBROW_AVX2 #if defined(HAS_I422TORGBAROW_AVX2) // 16 pixels @@ -2165,14 +2274,16 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) + YUVTORGB_AVX2(yuvconstants) // Step 3: Weave into RGBA "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" @@ -2192,13 +2303,134 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TORGBAROW_AVX2 +#if defined(HAS_NV12TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READNV12_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_NV12TOARGBROW_AVX2 + +#if defined(HAS_NV21TOARGBROW_AVX2) +// 16 pixels. +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, + const uint8* vu_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READNV21_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [vu_buf]"+r"(vu_buf), // %[vu_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21]"m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_NV21TOARGBROW_AVX2 + +#if defined(HAS_YUY2TOARGBROW_AVX2) +// 16 pixels. +// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUY2_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_YUY2TOARGBROW_AVX2 + +#if defined(HAS_UYVYTOARGBROW_AVX2) +// 16 pixels. +// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READUYVY_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_UYVYTOARGBROW_AVX2 + #ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { asm volatile ( @@ -2344,35 +2576,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORROW_SSE2 -void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile ( - LABELALIGN - "1: \n" - MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 - "movdqa %%xmm0,%%xmm1 \n" - "psllw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "pshuflw $0x1b,%%xmm0,%%xmm0 \n" - "pshufhw $0x1b,%%xmm0,%%xmm0 \n" - "pshufd $0x4e,%%xmm0,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1)",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); -} -#endif // HAS_MIRRORROW_SSE2 - -#ifdef HAS_MIRRORROW_UV_SSSE3 +#ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. static uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u @@ -2403,7 +2607,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "xmm0", "xmm1" ); } -#endif // HAS_MIRRORROW_UV_SSSE3 +#endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 @@ -2458,7 +2662,8 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -2485,7 +2690,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" @@ -2494,7 +2699,8 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -2520,7 +2726,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" @@ -2591,8 +2797,23 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { asm volatile ( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" LABELALIGN "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + LABELALIGN + "2: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -2600,7 +2821,8 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x20,%2 \n" - "jg 1b \n" + "jg 2b \n" + "9: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(count) // %2 @@ -2714,6 +2936,33 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { } #endif // HAS_ARGBCOPYALPHAROW_AVX2 +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +// width in pixels +void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ", %%xmm0 \n" + "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" + "lea " MEMLEA(0x20, 0) ", %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8, 1) ", %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1" + ); +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { @@ -2786,7 +3035,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_SETROW_X86 void SetRow_X86(uint8* dst, uint8 v8, int width) { size_t width_tmp = (size_t)(width >> 2); - const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes. + const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. asm volatile ( "rep stosl " MEMSTORESTRING(eax,0) " \n" : "+D"(dst), // %0 @@ -2817,7 +3066,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -2835,7 +3084,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "memory", "cc" , "xmm0", "xmm1", "xmm5" @@ -2843,7 +3092,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { } void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -2873,7 +3122,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : "r"((intptr_t)(stride_yuy2)) // %4 : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" @@ -2881,7 +3130,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -2907,14 +3156,14 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5" ); } -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile ( LABELALIGN "1: \n" @@ -2930,7 +3179,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "memory", "cc" , "xmm0", "xmm1" @@ -2938,7 +3187,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { } void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -2968,7 +3217,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : "r"((intptr_t)(stride_uyvy)) // %4 : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" @@ -2976,7 +3225,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3002,7 +3251,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5" @@ -3011,7 +3260,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -3031,7 +3280,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "memory", "cc" , "xmm0", "xmm1", "xmm5" @@ -3039,7 +3288,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { } void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -3070,7 +3319,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : "r"((intptr_t)(stride_yuy2)) // %4 : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5" @@ -3078,7 +3327,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -3107,14 +3356,14 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5" ); } -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile ( LABELALIGN "1: \n" @@ -3132,14 +3381,14 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "memory", "cc" , "xmm0", "xmm1", "xmm5" ); } void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -3171,7 +3420,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : "r"((intptr_t)(stride_uyvy)) // %4 : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5" @@ -3179,7 +3428,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -3208,7 +3457,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5" @@ -3216,92 +3465,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, } #endif // HAS_YUY2TOYROW_AVX2 -#ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time. -void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - - // 4 pixel loop. - LABELALIGN - "41: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 41b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. - "91: \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_ARGBBLENDROW_SSE2 - #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. static uvec8 kShuffleAlpha = { @@ -3310,15 +3473,6 @@ static uvec8 kShuffleAlpha = { }; // Blend 8 pixels at a time -// Shuffle table for reversing the bytes. - -// Same as SSE2, but replaces -// psrlw xmm3, 8 // alpha -// pshufhw xmm3, xmm3,0F5h // 8 alpha words -// pshuflw xmm3, xmm3,0F5h -// with.. -// pshufb xmm3, kShuffleAlpha // alpha - void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -3399,49 +3553,112 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATEROW_SSE2 -// Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { +#ifdef HAS_BLENDPLANEROW_SSSE3 +// Blend 8 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x8,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" - // 4 pixel loop. + // 8 pixel loop. LABELALIGN "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm2 \n" - "pshuflw $0xff,%%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pshufhw $0xff,%%xmm1,%%xmm2 \n" - "pshuflw $0xff,%%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm4,%%xmm2 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" ); } -#endif // HAS_ARGBATTENUATEROW_SSE2 +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 32 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + :: "memory", "cc", "eax", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha @@ -3542,7 +3759,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { // Unattenuate 4 pixels at a time. void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - uintptr_t alpha = 0; + uintptr_t alpha; asm volatile ( // 4 pixel loop. LABELALIGN @@ -3573,10 +3790,10 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "+r"(alpha) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 : "r"(fixed_invtbl8) // %4 : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -3592,7 +3809,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = { // Unattenuate 8 pixels at a time. void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { - uintptr_t alpha = 0; + uintptr_t alpha; asm volatile ( "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" @@ -3641,10 +3858,10 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "+r"(alpha) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 : "r"(fixed_invtbl8), // %4 "m"(kUnattenShuffleAlpha_AVX2) // %5 : "memory", "cc", NACL_R14 @@ -4569,7 +4786,7 @@ LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* src_dudv, int width) { intptr_t src_argb_stride_temp = src_argb_stride; - intptr_t temp = 0; + intptr_t temp; asm volatile ( "movq " MEMACCESS(3) ",%%xmm2 \n" "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" @@ -4641,7 +4858,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "+r"(dst_argb), // %2 "+r"(src_dudv), // %3 "+rm"(width), // %4 - "+r"(temp) // %5 + "=&r"(temp) // %5 : : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" @@ -4656,56 +4873,47 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int source_y_fraction) { asm volatile ( "sub %1,%0 \n" - "shr %3 \n" "cmp $0x0,%3 \n" "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" + "cmp $0x80,%3 \n" "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" "movd %3,%%xmm0 \n" "neg %3 \n" - "add $0x80,%3 \n" + "add $0x100,%3 \n" "movd %3,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n" "punpcklwd %%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" // General purpose row blend. LABELALIGN "1: \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + MEMOPMEM(movdqu,xmm2,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" "jmp 99f \n" - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 25b \n" - "jmp 99f \n" - // Blend 50 / 50. LABELALIGN "50: \n" @@ -4718,19 +4926,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "jg 50b \n" "jmp 99f \n" - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm0) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 75b \n" - "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. LABELALIGN "100: \n" @@ -4741,13 +4936,13 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "jg 100b \n" "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm5" + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_INTERPOLATEROW_SSSE3 @@ -4758,25 +4953,22 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { asm volatile ( - "shr %3 \n" "cmp $0x0,%3 \n" "je 100f \n" "sub %1,%0 \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" + "cmp $0x80,%3 \n" "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" "vmovd %3,%%xmm0 \n" "neg %3 \n" - "add $0x80,%3 \n" + "add $0x100,%3 \n" "vmovd %3,%%xmm5 \n" "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vpxor %%ymm0,%%ymm0,%%ymm0 \n" - "vpermd %%ymm5,%%ymm0,%%ymm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" // General purpose row blend. LABELALIGN @@ -4785,10 +4977,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) "lea " MEMLEA(0x20,1) ",%1 \n" @@ -4796,19 +4992,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "jg 1b \n" "jmp 99f \n" - // Blend 25 / 75. - LABELALIGN - "25: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm1) - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 25b \n" - "jmp 99f \n" - // Blend 50 / 50. LABELALIGN "50: \n" @@ -4820,19 +5003,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "jg 50b \n" "jmp 99f \n" - // Blend 75 / 25. - LABELALIGN - "75: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm0) - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 75b \n" - "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. LABELALIGN "100: \n" @@ -4844,130 +5014,19 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "999: \n" : "+D"(dst_ptr), // %0 "+S"(src_ptr), // %1 - "+c"(dst_width), // %2 + "+cm"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm5" + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" ); } #endif // HAS_INTERPOLATEROW_AVX2 -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "psubw %%xmm0,%%xmm2 \n" - "psubw %%xmm1,%%xmm3 \n" - "paddw %%xmm2,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "pmulhw %%xmm5,%%xmm2 \n" - "pmulhw %%xmm5,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_INTERPOLATEROW_SSE2 - #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { asm volatile ( "movdqu " MEMACCESS(3) ",%%xmm5 \n" LABELALIGN @@ -4984,7 +5043,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "r"(shuffler) // %3 : "memory", "cc" , "xmm0", "xmm1", "xmm5" @@ -4995,7 +5054,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { asm volatile ( "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" LABELALIGN @@ -5013,7 +5072,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "r"(shuffler) // %3 : "memory", "cc" , "xmm0", "xmm1", "xmm5" @@ -5024,8 +5083,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBSHUFFLEROW_SSE2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { - uintptr_t pixel_temp = 0u; + const uint8* shuffler, int width) { + uintptr_t pixel_temp; asm volatile ( "pxor %%xmm5,%%xmm5 \n" "mov " MEMACCESS(4) ",%k2 \n" @@ -5130,11 +5189,11 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "jg 3012b \n" "99: \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+d"(pixel_temp), // %2 - "+r"(pix) // %3 - : "r"(shuffler) // %4 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "=&d"(pixel_temp), // %2 + "+r"(width) // %3 + : "r"(shuffler) // %4 : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5" ); @@ -5311,7 +5370,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, // Tranform ARGB pixels with color table. void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { - uintptr_t pixel_temp = 0u; + uintptr_t pixel_temp; asm volatile ( // 1 pixel loop. LABELALIGN @@ -5331,10 +5390,10 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, "mov %b1," MEMACCESS2(-0x1,0) " \n" "dec %2 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 : "memory", "cc"); } #endif // HAS_ARGBCOLORTABLEROW_X86 @@ -5342,7 +5401,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { - uintptr_t pixel_temp = 0u; + uintptr_t pixel_temp; asm volatile ( // 1 pixel loop. LABELALIGN @@ -5359,10 +5418,10 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { "mov %b1," MEMACCESS2(-0x2,0) " \n" "dec %2 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 : "memory", "cc"); } #endif // HAS_RGBCOLORTABLEROW_X86 @@ -5372,8 +5431,8 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, uint32 lumacoeff) { - uintptr_t pixel_temp = 0u; - uintptr_t table_temp = 0u; + uintptr_t pixel_temp; + uintptr_t table_temp; asm volatile ( "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" @@ -5455,13 +5514,13 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "lea " MEMLEA(0x10,3) ",%3 \n" "sub $0x4,%4 \n" "jg 1b \n" - : "+d"(pixel_temp), // %0 - "+a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" ); } diff --git a/third_party/libyuv/source/row_mips.cc b/third_party/libyuv/source/row_mips.cc index cfc9ffe03..285f0b5ad 100644 --- a/third_party/libyuv/source/row_mips.cc +++ b/third_party/libyuv/source/row_mips.cc @@ -375,13 +375,13 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_MIPS -// MIPS DSPR2 functions +// DSPR2 functions #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ (__mips_dsp_rev >= 2) && \ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) -void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -389,7 +389,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez $t4, 2f \n" " andi %[width], %[width], 0xf \n" // residual - ".p2align 2 \n" "1: \n" "addiu $t4, $t4, -1 \n" "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 @@ -447,7 +446,7 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ); } -void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { +void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -457,7 +456,6 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "blez $t4, 2f \n" " addu %[src], %[src], %[width] \n" // src += width - ".p2align 2 \n" "1: \n" "lw $t0, -16(%[src]) \n" // |3|2|1|0| "lw $t1, -12(%[src]) \n" // |7|6|5|4| @@ -498,10 +496,10 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { ); } -void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - int x = 0; - int y = 0; +void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + int x; + int y; __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -512,7 +510,6 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez %[x], 2f \n" " addu %[src_uv], %[src_uv], $t4 \n" - ".p2align 2 \n" "1: \n" "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| @@ -582,7 +579,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, [dst_u] "+r" (dst_u), [dst_v] "+r" (dst_v), [x] "=&r" (x), - [y] "+r" (y) + [y] "=&r" (y) : [width] "r" (width) : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9" @@ -596,7 +593,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, // t8 = | 0 | G1 | 0 | g1 | // t2 = | 0 | R0 | 0 | r0 | // t1 = | 0 | R1 | 0 | r1 | -#define I422ToTransientMipsRGB \ +#define YUVTORGB \ "lw $t0, 0(%[y_buf]) \n" \ "lhu $t1, 0(%[u_buf]) \n" \ "lhu $t2, 0(%[v_buf]) \n" \ @@ -655,11 +652,13 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "addu.ph $t2, $t2, $s5 \n" \ "addu.ph $t1, $t1, $s5 \n" -void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +// TODO(fbarchard): accept yuv conversion constants. +void I422ToARGBRow_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -673,9 +672,8 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, "lui $s6, 0xff00 \n" "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - ".p2align 2 \n" "1: \n" - I422ToTransientMipsRGB + YUVTORGB // Arranging into argb format "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| @@ -717,136 +715,10 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, ); } -void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| - "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n" // |0|16|0|16| - "repl.ph $s5, 128 \n" // |128|128| - "lui $s6, 0xff00 \n" - "ori $s6, 0xff00 \n" // |ff|00|ff|00| - - ".p2align 2 \n" - "1: \n" - I422ToTransientMipsRGB -// Arranging into abgr format - "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1| - "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0| - "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0| - "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0| - - "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0| - "addiu %[width], -4 \n" - "addiu %[y_buf], 4 \n" - "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0| - "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0| - "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0| - "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0| - "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1| - "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1| - "sll $t9, $t9, 16 \n" - "sll $t8, $t8, 16 \n" - "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0| - "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0| -// Store results. - "sw $t2, 0(%[rgb_buf]) \n" - "sw $t0, 4(%[rgb_buf]) \n" - "sw $t1, 8(%[rgb_buf]) \n" - "sw $t3, 12(%[rgb_buf]) \n" - "bnez %[width], 1b \n" - " addiu %[rgb_buf], 16 \n" - "2: \n" - ".set pop \n" - :[y_buf] "+r" (y_buf), - [u_buf] "+r" (u_buf), - [v_buf] "+r" (v_buf), - [width] "+r" (width), - [rgb_buf] "+r" (rgb_buf) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 | - "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n" // |0|16|0|16| - "repl.ph $s5, 128 \n" // |128|128| - "lui $s6, 0xff \n" - "ori $s6, 0xff \n" // |00|ff|00|ff| - - ".p2align 2 \n" - "1: \n" - I422ToTransientMipsRGB - // Arranging into bgra format - "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| - "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| - "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| - "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| - - "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| - "addiu %[width], -4 \n" - "addiu %[y_buf], 4 \n" - "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| - "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| - "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 | - "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 | - "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff| - "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff| - "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff| - "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff| - "sll $t1, $t1, 16 \n" - "sll $t2, $t2, 16 \n" - "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff| - "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff| -// Store results. - "sw $t2, 0(%[rgb_buf]) \n" - "sw $t0, 4(%[rgb_buf]) \n" - "sw $t1, 8(%[rgb_buf]) \n" - "sw $t3, 12(%[rgb_buf]) \n" - "bnez %[width], 1b \n" - " addiu %[rgb_buf], 16 \n" - "2: \n" - ".set pop \n" - :[y_buf] "+r" (y_buf), - [u_buf] "+r" (u_buf), - [v_buf] "+r" (v_buf), - [width] "+r" (width), - [rgb_buf] "+r" (rgb_buf) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - // Bilinear filter 8x2 -> 8x1 -void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { int y0_fraction = 256 - source_y_fraction; const uint8* src_ptr1 = src_ptr + src_stride; @@ -857,7 +729,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, "replv.ph $t0, %[y0_fraction] \n" "replv.ph $t1, %[source_y_fraction] \n" - ".p2align 2 \n" "1: \n" "lw $t2, 0(%[src_ptr]) \n" "lw $t3, 0(%[src_ptr1]) \n" diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc index 1a72eb903..909df060c 100644 --- a/third_party/libyuv/source/row_neon.cc +++ b/third_party/libyuv/source/row_neon.cc @@ -93,7 +93,7 @@ extern "C" { "vuzp.u8 d2, d3 \n" \ "vtrn.u32 d2, d3 \n" -#define YUV422TORGB_SETUP_REG \ +#define YUVTORGB_SETUP \ MEMACCESS([kUVToRB]) \ "vld1.8 {d24}, [%[kUVToRB]] \n" \ MEMACCESS([kUVToG]) \ @@ -107,7 +107,7 @@ extern "C" { MEMACCESS([kYToRgb]) \ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" -#define YUV422TORGB \ +#define YUVTORGB \ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ "vmovl.u8 q0, d0 \n" /* Y */\ @@ -134,52 +134,19 @@ extern "C" { "vqshrun.s16 d22, q9, #6 \n" /* R */ \ "vqshrun.s16 d21, q0, #6 \n" /* G */ -// YUV to RGB conversion constants. -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ - -// U and V contributions to R,G,B. -#define UB -128 /* -min(128, round(2.018 * 64)) */ -#define UG 25 /* -round(-0.391 * 64) */ -#define VG 52 /* -round(-0.813 * 64) */ -#define VR -102 /* -round(1.596 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 - YGB) -#define BG (UG * 128 + VG * 128 - YGB) -#define BR (VR * 128 - YGB) - -static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102, - 0, 0, 0, 0, 0, 0, 0, 0 }; -static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52, - 0, 0, 0, 0, 0, 0, 0, 0 }; -static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; -static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; - -#undef YG -#undef YGB -#undef UB -#undef UG -#undef VG -#undef VR -#undef BB -#undef BG -#undef BR - void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV444 - YUV422TORGB + YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -188,10 +155,10 @@ void I444ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -201,45 +168,15 @@ void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" - "1: \n" - READYUV422 - YUV422TORGB - "subs %4, %4, #8 \n" + YUVTORGB_SETUP "vmov.u8 d23, #255 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width) { - asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" - READYUV411 - YUV422TORGB + READYUV422 + YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -248,72 +185,73 @@ void I411ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -void I422ToBGRARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_bgra, - int width) { +void I422AlphaToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB - "subs %4, %4, #8 \n" - "vswp.u8 d20, d22 \n" - "vmov.u8 d19, #255 \n" + YUVTORGB + "subs %5, %5, #8 \n" MEMACCESS(3) - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "vld1.8 {d23}, [%3]! \n" + MEMACCESS(4) + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 - "+r"(dst_bgra), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -void I422ToABGRRow_NEON(const uint8* src_y, +void I411ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_abgr, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" - READYUV422 - YUV422TORGB + READYUV411 + YUVTORGB "subs %4, %4, #8 \n" - "vswp.u8 d20, d22 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 - "+r"(dst_abgr), // %3 + "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -323,15 +261,15 @@ void I422ToRGBARow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB + YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" + "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" @@ -340,10 +278,10 @@ void I422ToRGBARow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgba), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -353,13 +291,13 @@ void I422ToRGB24Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB + YUVTORGB "subs %4, %4, #8 \n" MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" @@ -369,68 +307,33 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb24), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToRAWRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_raw, - int width) { - asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" - "1: \n" - READYUV422 - YUV422TORGB - "subs %4, %4, #8 \n" - "vswp.u8 d20, d22 \n" - MEMACCESS(3) - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_raw), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #define ARGBTORGB565 \ - "vshr.u8 d20, d20, #3 \n" /* B */ \ - "vshr.u8 d21, d21, #2 \n" /* G */ \ - "vshr.u8 d22, d22, #3 \n" /* R */ \ - "vmovl.u8 q8, d20 \n" /* B */ \ - "vmovl.u8 q9, d21 \n" /* G */ \ - "vmovl.u8 q10, d22 \n" /* R */ \ - "vshl.u16 q9, q9, #5 \n" /* G */ \ - "vshl.u16 q10, q10, #11 \n" /* R */ \ - "vorr q0, q8, q9 \n" /* BG */ \ - "vorr q0, q0, q10 \n" /* BGR */ + "vshll.u8 q0, d22, #8 \n" /* R */ \ + "vshll.u8 q8, d21, #8 \n" /* G */ \ + "vshll.u8 q9, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #5 \n" /* RG */ \ + "vsri.16 q0, q9, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB + YUVTORGB "subs %4, %4, #8 \n" ARGBTORGB565 MEMACCESS(3) @@ -441,41 +344,35 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb565), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #define ARGBTOARGB1555 \ - "vshr.u8 q10, q10, #3 \n" /* B */ \ - "vshr.u8 d22, d22, #3 \n" /* R */ \ - "vshr.u8 d23, d23, #7 \n" /* A */ \ - "vmovl.u8 q8, d20 \n" /* B */ \ - "vmovl.u8 q9, d21 \n" /* G */ \ - "vmovl.u8 q10, d22 \n" /* R */ \ - "vmovl.u8 q11, d23 \n" /* A */ \ - "vshl.u16 q9, q9, #5 \n" /* G */ \ - "vshl.u16 q10, q10, #10 \n" /* R */ \ - "vshl.u16 q11, q11, #15 \n" /* A */ \ - "vorr q0, q8, q9 \n" /* BG */ \ - "vorr q1, q10, q11 \n" /* RA */ \ - "vorr q0, q0, q1 \n" /* BGRA */ + "vshll.u8 q0, d23, #8 \n" /* A */ \ + "vshll.u8 q8, d22, #8 \n" /* R */ \ + "vshll.u8 q9, d21, #8 \n" /* G */ \ + "vshll.u8 q10, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #1 \n" /* AR */ \ + "vsri.16 q0, q9, #6 \n" /* ARG */ \ + "vsri.16 q0, q10, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB + YUVTORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB1555 @@ -487,10 +384,10 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb1555), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -509,14 +406,14 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" "1: \n" READYUV422 - YUV422TORGB + YUVTORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB4444 @@ -528,10 +425,10 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb4444), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&kUVToRB), // %5 - [kUVToG]"r"(&kUVToG), // %6 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -541,23 +438,22 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV400 - YUV422TORGB + YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : [kUVToRB]"r"(&kUVToRB), // %3 - [kUVToG]"r"(&kUVToG), // %4 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), + [kUVToG]"r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -568,7 +464,6 @@ void J400ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( "vmov.u8 d23, #255 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d20}, [%0]! \n" @@ -589,15 +484,15 @@ void J400ToARGBRow_NEON(const uint8* src_y, void NV12ToARGBRow_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READNV12 - YUV422TORGB + YUVTORGB "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" @@ -605,38 +500,38 @@ void NV12ToARGBRow_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : [kUVToRB]"r"(&kUVToRB), // %4 - [kUVToG]"r"(&kUVToG), // %5 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, + const uint8* src_vu, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READNV21 - YUV422TORGB + YUVTORGB "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 - "+r"(src_uv), // %1 + "+r"(src_vu), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : [kUVToRB]"r"(&kUVToRB), // %4 - [kUVToG]"r"(&kUVToG), // %5 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -645,13 +540,13 @@ void NV21ToARGBRow_NEON(const uint8* src_y, void NV12ToRGB565Row_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP "1: \n" READNV12 - YUV422TORGB + YUVTORGB "subs %3, %3, #8 \n" ARGBTORGB565 MEMACCESS(2) @@ -661,38 +556,10 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 "+r"(width) // %3 - : [kUVToRB]"r"(&kUVToRB), // %4 - [kUVToG]"r"(&kUVToG), // %5 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void NV21ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - int width) { - asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" - "1: \n" - READNV21 - YUV422TORGB - "subs %3, %3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&kUVToRB), // %4 - [kUVToG]"r"(&kUVToG), // %5 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -700,25 +567,25 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUY2 - YUV422TORGB + YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : [kUVToRB]"r"(&kUVToRB), // %3 - [kUVToG]"r"(&kUVToG), // %4 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -726,25 +593,25 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - ".p2align 2 \n" + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READUYVY - YUV422TORGB + YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : [kUVToRB]"r"(&kUVToRB), // %3 - [kUVToG]"r"(&kUVToG), // %4 - [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -754,7 +621,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV @@ -777,7 +643,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load U @@ -800,7 +665,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 @@ -855,7 +719,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2 \n" "sub %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 @@ -882,7 +745,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "add %0, %0, %3, lsl #1 \n" "sub %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 @@ -909,7 +771,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2, lsl #2 \n" "sub %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 @@ -928,10 +789,9 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ); } -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. @@ -941,16 +801,15 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List ); } -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -961,12 +820,30 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List ); } +void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List + ); +} + #define RGB565TOARGB \ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ @@ -979,10 +856,9 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "vorr.u8 d2, d1, d5 \n" /* R */ \ "vorr.u8 d1, d4, d6 \n" /* G */ -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -993,7 +869,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); @@ -1027,10 +903,9 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "vorr.u8 d1, d4, d6 \n" /* G */ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int pix) { + int width) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -1041,7 +916,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); @@ -1058,10 +933,9 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int pix) { + int width) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -1072,15 +946,14 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. @@ -1090,15 +963,14 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List ); } -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. @@ -1109,15 +981,14 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List ); } -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. @@ -1127,15 +998,14 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List ); } -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. @@ -1145,16 +1015,15 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List ); } void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1167,16 +1036,15 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List ); } void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1189,17 +1057,16 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List ); } void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1217,17 +1084,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "+r"(stride_yuy2), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List ); } void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // stride + src_uyvy - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1245,7 +1111,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "+r"(stride_uyvy), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List ); @@ -1253,7 +1119,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { asm volatile ( MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // shuffler @@ -1268,7 +1134,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "q0", "q1", "q2" // Clobber List ); @@ -1279,7 +1145,6 @@ void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_yuy2, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys @@ -1306,7 +1171,6 @@ void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_uyvy, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys @@ -1328,9 +1192,8 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ); } -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. @@ -1341,7 +1204,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q8", "q9", "q10", "q11" ); @@ -1350,7 +1213,6 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width) { asm volatile ( - ".p2align 2 \n" "vdup.32 d2, %2 \n" // dither4 "1: \n" MEMACCESS(1) @@ -1372,9 +1234,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, } void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, - int pix) { + int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. @@ -1385,17 +1246,16 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q8", "q9", "q10", "q11" ); } void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, - int pix) { + int width) { asm volatile ( "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. @@ -1406,19 +1266,18 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q8", "q9", "q10", "q11" ); } -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1433,18 +1292,35 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q12", "q13" ); } -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1458,7 +1334,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q12", "q13" ); @@ -1466,7 +1342,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { // 8x1 pixels. void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient @@ -1474,7 +1350,6 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1500,65 +1375,15 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" ); } -// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. -void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) { - asm volatile ( - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - - "subs %3, %3, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient @@ -1566,7 +1391,6 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1613,14 +1437,14 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ @@ -1635,7 +1459,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1644,7 +1468,6 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1676,7 +1499,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -1685,7 +1508,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, // TODO(fbarchard): Subsample match C code. void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient @@ -1694,7 +1517,6 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1726,7 +1548,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -1734,7 +1556,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, } void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1743,7 +1565,6 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. @@ -1775,7 +1596,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "+r"(src_stride_bgra), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -1783,7 +1604,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, } void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1792,7 +1613,6 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. @@ -1824,7 +1644,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "+r"(src_stride_abgr), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -1832,7 +1652,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, } void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1841,7 +1661,6 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. @@ -1873,7 +1692,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "+r"(src_stride_rgba), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -1881,7 +1700,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, } void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1890,7 +1709,6 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. @@ -1922,7 +1740,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "+r"(src_stride_rgb24), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -1930,7 +1748,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, } void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1939,7 +1757,6 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. @@ -1971,16 +1788,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "+r"(src_stride_raw), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1989,7 +1806,6 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -2041,16 +1857,16 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "+r"(src_stride_rgb565), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -2059,7 +1875,6 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -2111,16 +1926,16 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "+r"(src_stride_argb1555), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -2129,7 +1944,6 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -2181,20 +1995,19 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "+r"(src_stride_argb4444), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -2210,19 +2023,18 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" ); } -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -2238,19 +2050,18 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" ); } -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -2266,19 +2077,18 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" ); } -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. @@ -2293,19 +2103,18 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" ); } -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. @@ -2320,19 +2129,18 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" ); } -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. @@ -2347,19 +2155,18 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" ); } -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. @@ -2374,19 +2181,18 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" ); } -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { asm volatile ( "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. @@ -2401,7 +2207,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" ); @@ -2411,16 +2217,13 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" "cmp %4, #128 \n" "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" "vdup.8 d5, %4 \n" "rsb %4, #256 \n" @@ -2443,20 +2246,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, "bgt 1b \n" "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" - // Blend 50 / 50. "50: \n" MEMACCESS(1) @@ -2470,20 +2259,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, "bgt 50b \n" "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) @@ -2498,7 +2273,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "+r"(src_ptr), // %1 "+r"(src_stride), // %2 "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 + "+r"(y1_fraction) // %4 : : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" ); @@ -2605,7 +2380,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. @@ -2648,7 +2422,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. @@ -2684,7 +2457,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -2721,7 +2493,6 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d30, #50 \n" // BR coefficient - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. @@ -2760,7 +2531,6 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. @@ -2813,14 +2583,11 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ); } -// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. -#ifdef HAS_ARGBMULTIPLYROW_NEON // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -2847,14 +2614,12 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, : "cc", "memory", "q0", "q1", "q2", "q3" ); } -#endif // HAS_ARGBMULTIPLYROW_NEON // Add 2 rows of ARGB pixels together, 8 pixels at a time. void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -2881,7 +2646,6 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -2913,7 +2677,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. @@ -2940,7 +2703,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_y, int width) { asm volatile ( // 16 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. @@ -2970,7 +2732,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. @@ -2997,7 +2758,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0],%5 \n" // top @@ -3041,7 +2801,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0],%4 \n" // left diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc index 5d015454b..6375d4f55 100644 --- a/third_party/libyuv/source/row_neon64.cc +++ b/third_party/libyuv/source/row_neon64.cc @@ -91,17 +91,15 @@ extern "C" { "uzp2 v3.8b, v2.8b, v2.8b \n" \ "ins v1.s[1], v3.s[0] \n" -#define YUV422TORGB_SETUP_REG \ +#define YUVTORGB_SETUP \ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "movi v27.8h, #128 \n" \ - "movi v28.8h, #102 \n" \ - "movi v29.8h, #25 \n" \ - "movi v30.8h, #52 \n" + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" -#define YUV422TORGB(vR, vG, vB) \ +#define YUVTORGB(vR, vG, vB) \ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ @@ -129,57 +127,19 @@ extern "C" { "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ -// YUV to RGB conversion constants. -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ - -// U and V contributions to R,G,B. -#define UB -128 /* -min(128, round(2.018 * 64)) */ -#define UG 25 /* -round(-0.391 * 64) */ -#define VG 52 /* -round(-0.813 * 64) */ -#define VR -102 /* -round(1.596 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 - YGB) -#define BG (UG * 128 + VG * 128 - YGB) -#define BR (VR * 128 - YGB) - -static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; -static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; - -#undef YG -#undef YGB -#undef UB -#undef UG -#undef VG -#undef VR -#undef BB -#undef BG -#undef BR - -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - - -#ifdef HAS_I444TOARGBROW_NEON void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV444 - YUV422TORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -188,56 +148,28 @@ void I444ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I444TOARGBROW_NEON -#ifdef HAS_I422TOARGBROW_NEON void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG - "1: \n" - READYUV422 - YUV422TORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" + YUVTORGB_SETUP "movi v23.8b, #255 \n" /* A */ - MEMACCESS(3) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} -#endif // HAS_I422TOARGBROW_NEON - -#ifdef HAS_I411TOARGBROW_NEON -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width) { - asm volatile ( - YUV422TORGB_SETUP_REG "1: \n" - READYUV411 - YUV422TORGB(v22, v21, v20) + READYUV422 + YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -246,85 +178,91 @@ void I411ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I411TOARGBROW_NEON -#ifdef HAS_I422TOBGRAROW_NEON -void I422ToBGRARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_bgra, - int width) { +void I422AlphaToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB(v21, v22, v23) - "subs %w4, %w4, #8 \n" - "movi v20.8b, #255 \n" /* A */ + YUVTORGB(v22, v21, v20) MEMACCESS(3) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "ld1 {v23.8b}, [%3], #8 \n" + "subs %w5, %w5, #8 \n" + MEMACCESS(4) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 - "+r"(dst_bgra), // %3 - "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I422TOBGRAROW_NEON -#ifdef HAS_I422TOABGRROW_NEON -void I422ToABGRRow_NEON(const uint8* src_y, +void I411ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_abgr, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" - READYUV422 - YUV422TORGB(v20, v21, v22) + READYUV411 + YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 - "+r"(dst_abgr), // %3 + "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I422TOABGRROW_NEON -#ifdef HAS_I422TORGBAROW_NEON void I422ToRGBARow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v20.8b, #255 \n" /* A */ "1: \n" READYUV422 - YUV422TORGB(v23, v22, v21) + YUVTORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" - "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -333,25 +271,26 @@ void I422ToRGBARow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgba), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I422TORGBAROW_NEON -#ifdef HAS_I422TORGB24ROW_NEON void I422ToRGB24Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" @@ -361,60 +300,33 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb24), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I422TORGB24ROW_NEON - -#ifdef HAS_I422TORAWROW_NEON -void I422ToRAWRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_raw, - int width) { - asm volatile ( - YUV422TORGB_SETUP_REG - "1: \n" - READYUV422 - YUV422TORGB(v20, v21, v22) - "subs %w4, %w4, #8 \n" - MEMACCESS(3) - "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_raw), // %3 - "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} -#endif // HAS_I422TORAWROW_NEON #define ARGBTORGB565 \ "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ "sri v0.8h, v21.8h, #5 \n" /* RG */ \ "sri v0.8h, v20.8h, #11 \n" /* RGB */ -#ifdef HAS_I422TORGB565ROW_NEON void I422ToRGB565Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP "1: \n" READYUV422 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 MEMACCESS(3) @@ -425,36 +337,37 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb565), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I422TORGB565ROW_NEON #define ARGBTOARGB1555 \ "shll v0.8h, v23.8b, #8 \n" /* A */ \ "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ "sri v0.8h, v22.8h, #1 \n" /* AR */ \ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ "sri v0.8h, v20.8h, #11 \n" /* ARGB */ -#ifdef HAS_I422TOARGB1555ROW_NEON void I422ToARGB1555Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUV422 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" ARGBTOARGB1555 MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. @@ -464,13 +377,14 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb1555), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I422TOARGB1555ROW_NEON #define ARGBTOARGB4444 \ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ @@ -482,18 +396,18 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ -#ifdef HAS_I422TOARGB4444ROW_NEON void I422ToARGB4444Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP "movi v4.16b, #0x0f \n" // bits to clear with vbic. "1: \n" READYUV422 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 @@ -505,41 +419,40 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb4444), // %3 "+r"(width) // %4 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I422TOARGB4444ROW_NEON -#ifdef HAS_I400TOARGBROW_NEON void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { - int64 width64 = (int64)(width); asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUV400 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 - "+r"(width64) // %2 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + "+r"(width) // %2 + : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), + [kUVToG]"r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_I400TOARGBROW_NEON -#ifdef HAS_J400TOARGBROW_NEON void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { @@ -561,20 +474,19 @@ void J400ToARGBRow_NEON(const uint8* src_y, : "cc", "memory", "v20", "v21", "v22", "v23" ); } -#endif // HAS_J400TOARGBROW_NEON -#ifdef HAS_NV12TOARGBROW_NEON void NV12ToARGBRow_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READNV12 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -582,78 +494,53 @@ void NV12ToARGBRow_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_NV12TOARGBROW_NEON -#ifdef HAS_NV21TOARGBROW_NEON void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, + const uint8* src_vu, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READNV21 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 - "+r"(src_uv), // %1 + "+r"(src_vu), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_NV21TOARGBROW_NEON -#ifdef HAS_NV12TORGB565ROW_NEON void NV12ToRGB565Row_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP "1: \n" READNV12 - YUV422TORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} -#endif // HAS_NV12TORGB565ROW_NEON - -#ifdef HAS_NV21TORGB565ROW_NEON -void NV21ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - int width) { - asm volatile ( - YUV422TORGB_SETUP_REG - "1: \n" - READNV21 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 MEMACCESS(2) @@ -663,68 +550,68 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 "+r"(width) // %3 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_NV21TORGB565ROW_NEON -#ifdef HAS_YUY2TOARGBROW_NEON void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { - int64 width64 = (int64)(width); asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUY2 - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 - "+r"(width64) // %2 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + "+r"(width) // %2 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_YUY2TOARGBROW_NEON -#ifdef HAS_UYVYTOARGBROW_NEON void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { - int64 width64 = (int64)(width); asm volatile ( - YUV422TORGB_SETUP_REG + YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READUYVY - YUV422TORGB(v22, v21, v20) + YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 - "+r"(width64) // %2 - : [kUVBiasBGR]"r"(&kUVBiasBGR), - [kYToRgb]"r"(&kYToRgb) + "+r"(width) // %2 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_UYVYTOARGBROW_NEON // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -#ifdef HAS_SPLITUVROW_NEON void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -745,10 +632,8 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, : "cc", "memory", "v0", "v1" // Clobber List ); } -#endif // HAS_SPLITUVROW_NEON // Reads 16 U's and V's and writes out 16 pairs of UV. -#ifdef HAS_MERGEUVROW_NEON void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( @@ -770,10 +655,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, : "cc", "memory", "v0", "v1" // Clobber List ); } -#endif // HAS_MERGEUVROW_NEON // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -#ifdef HAS_COPYROW_NEON void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( "1: \n" @@ -790,17 +673,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } -#endif // HAS_COPYROW_NEON // SetRow writes 'count' bytes using an 8 bit value repeated. void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile ( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop + "subs %w1, %w1, #16 \n" // 16 bytes per loop MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v8) // %2 @@ -812,10 +694,10 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop + "subs %w1, %w1, #4 \n" // 4 ints per loop MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 @@ -823,18 +705,15 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { ); } -#ifdef HAS_MIRRORROW_NEON void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - int64 width64 = (int64) width; asm volatile ( // Start at end of source row. - "add %0, %0, %2 \n" + "add %0, %0, %w2, sxtw \n" "sub %0, %0, #16 \n" - "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %2, %2, #16 \n" // 16 pixels per loop. + "subs %w2, %w2, #16 \n" // 16 pixels per loop. "rev64 v0.16b, v0.16b \n" MEMACCESS(1) "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 @@ -843,26 +722,22 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(width64) // %2 + "+r"(width) // %2 : "r"((ptrdiff_t)-16) // %3 : "cc", "memory", "v0" ); } -#endif // HAS_MIRRORROW_NEON -#ifdef HAS_MIRRORUVROW_NEON void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { - int64 width64 = (int64) width; asm volatile ( // Start at end of source row. - "add %0, %0, %3, lsl #1 \n" + "add %0, %0, %w3, sxtw #1 \n" "sub %0, %0, #16 \n" - "1: \n" MEMACCESS(0) "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %3, %3, #8 \n" // 8 pixels per loop. + "subs %w3, %w3, #8 \n" // 8 pixels per loop. "rev64 v0.8b, v0.8b \n" "rev64 v1.8b, v1.8b \n" MEMACCESS(1) @@ -873,25 +748,21 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(width64) // %3 + "+r"(width) // %3 : "r"((ptrdiff_t)-16) // %4 : "cc", "memory", "v0", "v1" ); } -#endif // HAS_MIRRORUVROW_NEON -#ifdef HAS_ARGBMIRRORROW_NEON void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - int64 width64 = (int64) width; asm volatile ( - // Start at end of source row. - "add %0, %0, %2, lsl #2 \n" + // Start at end of source row. + "add %0, %0, %w2, sxtw #2 \n" "sub %0, %0, #16 \n" - "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %2, %2, #4 \n" // 4 pixels per loop. + "subs %w2, %w2, #4 \n" // 4 pixels per loop. "rev64 v0.4s, v0.4s \n" MEMACCESS(1) "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 @@ -900,15 +771,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(width64) // %2 + "+r"(width) // %2 : "r"((ptrdiff_t)-16) // %3 : "cc", "memory", "v0" ); } -#endif // HAS_ARGBMIRRORROW_NEON -#ifdef HAS_RGB24TOARGBROW_NEON -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { asm volatile ( "movi v4.8b, #255 \n" // Alpha "1: \n" @@ -920,15 +789,13 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } -#endif // HAS_RGB24TOARGBROW_NEON -#ifdef HAS_RAWTOARGBROW_NEON -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { asm volatile ( "movi v5.8b, #255 \n" // Alpha "1: \n" @@ -942,12 +809,30 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } -#endif // HAS_RAWTOARGBROW_NEON + +void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + MEMACCESS(1) + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} #define RGB565TOARGB \ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ @@ -962,8 +847,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ "dup v2.2D, v0.D[1] \n" /* R */ -#ifdef HAS_RGB565TOARGBROW_NEON -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { asm volatile ( "movi v3.8b, #255 \n" // Alpha "1: \n" @@ -976,12 +860,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List ); } -#endif // HAS_RGB565TOARGBROW_NEON #define ARGB1555TOARGB \ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ @@ -1020,9 +903,8 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ "dup v1.2D, v0.D[1] \n" /* G */ \ -#ifdef HAS_ARGB1555TOARGBROW_NEON void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int pix) { + int width) { asm volatile ( "movi v3.8b, #255 \n" // Alpha "1: \n" @@ -1035,12 +917,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } -#endif // HAS_ARGB1555TOARGBROW_NEON #define ARGB4444TOARGB \ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ @@ -1054,9 +935,8 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "dup v0.2D, v2.D[1] \n" \ "dup v1.2D, v3.D[1] \n" -#ifdef HAS_ARGB4444TOARGBROW_NEON void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int pix) { + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1068,15 +948,13 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List ); } -#endif // HAS_ARGB4444TOARGBROW_NEON -#ifdef HAS_ARGBTORGB24ROW_NEON -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1087,15 +965,13 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } -#endif // HAS_ARGBTORGB24ROW_NEON -#ifdef HAS_ARGBTORAWROW_NEON -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1108,15 +984,13 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } -#endif // HAS_ARGBTORAWROW_NEON -#ifdef HAS_YUY2TOYROW_NEON -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1127,15 +1001,13 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List ); } -#endif // HAS_YUY2TOYROW_NEON -#ifdef HAS_UYVYTOYROW_NEON -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1146,16 +1018,14 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List ); } -#endif // HAS_UYVYTOYROW_NEON -#ifdef HAS_YUY2TOUV422ROW_NEON void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1169,16 +1039,14 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } -#endif // HAS_YUY2TOUV422ROW_NEON -#ifdef HAS_UYVYTOUV422ROW_NEON void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1192,16 +1060,14 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } -#endif // HAS_UYVYTOUV422ROW_NEON -#ifdef HAS_YUY2TOUVROW_NEON void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile ( "1: \n" @@ -1221,17 +1087,15 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "+r"(src_yuy2b), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } -#endif // HAS_YUY2TOUVROW_NEON -#ifdef HAS_UYVYTOUVROW_NEON void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_uyvyb = src_uyvy + stride_uyvy; asm volatile ( "1: \n" @@ -1251,18 +1115,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "+r"(src_uyvyb), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } -#endif // HAS_UYVYTOUVROW_NEON // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -#ifdef HAS_ARGBSHUFFLEROW_NEON void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { asm volatile ( MEMACCESS(3) "ld1 {v2.16b}, [%3] \n" // shuffler @@ -1276,14 +1138,12 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "v0", "v1", "v2" // Clobber List ); } -#endif // HAS_ARGBSHUFFLEROW_NEON -#ifdef HAS_I422TOYUY2ROW_NEON void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1310,9 +1170,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, : "cc", "memory", "v0", "v1", "v2", "v3" ); } -#endif // HAS_I422TOYUY2ROW_NEON -#ifdef HAS_I422TOUYVYROW_NEON void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1339,10 +1197,8 @@ void I422ToUYVYRow_NEON(const uint8* src_y, : "cc", "memory", "v0", "v1", "v2", "v3" ); } -#endif // HAS_I422TOUYVYROW_NEON -#ifdef HAS_ARGBTORGB565ROW_NEON -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1354,14 +1210,12 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } -#endif // HAS_ARGBTORGB565ROW_NEON -#ifdef HAS_ARGBTORGB565DITHERROW_NEON void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width) { asm volatile ( @@ -1384,11 +1238,9 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" ); } -#endif // HAS_ARGBTORGB565ROW_NEON -#ifdef HAS_ARGBTOARGB1555ROW_NEON void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, - int pix) { + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1400,16 +1252,14 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } -#endif // HAS_ARGBTOARGB1555ROW_NEON -#ifdef HAS_ARGBTOARGB4444ROW_NEON void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, - int pix) { + int width) { asm volatile ( "movi v4.16b, #0x0f \n" // bits to clear with vbic. "1: \n" @@ -1422,15 +1272,13 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" ); } -#endif // HAS_ARGBTOARGB4444ROW_NEON -#ifdef HAS_ARGBTOYROW_NEON -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -1450,15 +1298,30 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } -#endif // HAS_ARGBTOYROW_NEON -#ifdef HAS_ARGBTOYJROW_NEON -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #15 \n" // B * 0.11400 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient @@ -1476,17 +1339,15 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } -#endif // HAS_ARGBTOYJROW_NEON // 8x1 pixels. -#ifdef HAS_ARGBTOUV444ROW_NEON void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient "movi v25.8b, #74 \n" // UG -0.5781 coefficient @@ -1519,62 +1380,24 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", "v27", "v28", "v29" ); } -#endif // HAS_ARGBTOUV444ROW_NEON -// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. -#ifdef HAS_ARGBTOUV422ROW_NEON -void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) { - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "mul v3.8h, v0.8h, v20.8h \n" // B - "mls v3.8h, v1.8h, v21.8h \n" // G - "mls v3.8h, v2.8h, v22.8h \n" // R - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - - "mul v4.8h, v2.8h, v20.8h \n" // R - "mls v4.8h, v1.8h, v24.8h \n" // G - "mls v4.8h, v0.8h, v23.8h \n" // B - "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V - - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} -#endif // HAS_ARGBTOUV422ROW_NEON +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ -// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. -#ifdef HAS_ARGBTOUV411ROW_NEON +// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int pix) { + int width) { asm volatile ( RGBTOUV_SETUP_REG "1: \n" @@ -1616,15 +1439,14 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 + "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_ARGBTOUV411ROW_NEON -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ @@ -1640,9 +1462,8 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -#ifdef HAS_ARGBTOUVROW_NEON void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG @@ -1674,18 +1495,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_ARGBTOUVROW_NEON // TODO(fbarchard): Subsample match C code. -#ifdef HAS_ARGBTOUVJROW_NEON void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 @@ -1721,17 +1540,15 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_ARGBTOUVJROW_NEON -#ifdef HAS_BGRATOUVROW_NEON void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG @@ -1762,17 +1579,15 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "+r"(src_bgra_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_BGRATOUVROW_NEON -#ifdef HAS_ABGRTOUVROW_NEON void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG @@ -1803,17 +1618,15 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "+r"(src_abgr_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_ABGRTOUVROW_NEON -#ifdef HAS_RGBATOUVROW_NEON void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG @@ -1844,17 +1657,15 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "+r"(src_rgba_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_RGBATOUVROW_NEON -#ifdef HAS_RGB24TOUVROW_NEON void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG @@ -1885,17 +1696,15 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_RGB24TOUVROW_NEON -#ifdef HAS_RAWTOUVROW_NEON void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG @@ -1926,18 +1735,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } -#endif // HAS_RAWTOUVROW_NEON -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. -#ifdef HAS_RGB565TOUVROW_NEON +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile ( "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 @@ -2001,19 +1808,17 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "+r"(src_rgb565_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" ); } -#endif // HAS_RGB565TOUVROW_NEON -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. -#ifdef HAS_ARGB1555TOUVROW_NEON +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile ( RGBTOUV_SETUP_REG @@ -2072,19 +1877,17 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" ); } -#endif // HAS_ARGB1555TOUVROW_NEON -// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. -#ifdef HAS_ARGB4444TOUVROW_NEON +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile ( RGBTOUV_SETUP_REG @@ -2143,7 +1946,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(pix) // %4 + "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", @@ -2151,10 +1954,8 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ); } -#endif // HAS_ARGB4444TOUVROW_NEON -#ifdef HAS_RGB565TOYROW_NEON -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { asm volatile ( "movi v24.8b, #13 \n" // B * 0.1016 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient @@ -2175,16 +1976,14 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", "v27" ); } -#endif // HAS_RGB565TOYROW_NEON -#ifdef HAS_ARGB1555TOYROW_NEON -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2205,15 +2004,13 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } -#endif // HAS_ARGB1555TOYROW_NEON -#ifdef HAS_ARGB4444TOYROW_NEON -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { asm volatile ( "movi v24.8b, #13 \n" // B * 0.1016 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient @@ -2234,15 +2031,13 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" ); } -#endif // HAS_ARGB4444TOYROW_NEON -#ifdef HAS_BGRATOYROW_NEON -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2262,15 +2057,13 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } -#endif // HAS_BGRATOYROW_NEON -#ifdef HAS_ABGRTOYROW_NEON -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2290,15 +2083,13 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } -#endif // HAS_ABGRTOYROW_NEON -#ifdef HAS_RGBATOYROW_NEON -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2318,15 +2109,13 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } -#endif // HAS_RGBATOYROW_NEON -#ifdef HAS_RGB24TOYROW_NEON -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2346,15 +2135,13 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } -#endif // HAS_RGB24TOYROW_NEON -#ifdef HAS_RAWTOYROW_NEON -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { asm volatile ( "movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2374,15 +2161,13 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } -#endif // HAS_RAWTOYROW_NEON // Bilinear filter 16x2 -> 16x1 -#ifdef HAS_INTERPOLATEROW_NEON void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -2392,12 +2177,8 @@ void InterpolateRow_NEON(uint8* dst_ptr, asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" "cmp %w4, #128 \n" "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" "dup v5.16b, %w4 \n" "dup v4.16b, %w5 \n" @@ -2419,20 +2200,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b.gt 1b \n" "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - // Blend 50 / 50. "50: \n" MEMACCESS(1) @@ -2446,20 +2213,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b.gt 50b \n" "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) @@ -2480,10 +2233,8 @@ void InterpolateRow_NEON(uint8* dst_ptr, : "cc", "memory", "v0", "v1", "v3", "v4", "v5" ); } -#endif // HAS_INTERPOLATEROW_NEON // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -#ifdef HAS_ARGBBLENDROW_NEON void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2552,10 +2303,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "v16", "v17", "v18" ); } -#endif // HAS_ARGBBLENDROW_NEON // Attenuate 8 pixels at a time. -#ifdef HAS_ARGBATTENUATEROW_NEON void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( // Attenuate 8 pixels. @@ -2579,11 +2328,9 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } -#endif // HAS_ARGBATTENUATEROW_NEON // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -#ifdef HAS_ARGBQUANTIZEROW_NEON void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile ( @@ -2623,12 +2370,10 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } -#endif // HAS_ARGBQUANTIZEROW_NEON // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -#ifdef HAS_ARGBSHADEROW_NEON void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { asm volatile ( @@ -2663,12 +2408,10 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, : "cc", "memory", "v0", "v4", "v5", "v6", "v7" ); } -#endif // HAS_ARGBSHADEROW_NEON // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -#ifdef HAS_ARGBGRAYROW_NEON void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "movi v24.8b, #15 \n" // B * 0.11400 coefficient @@ -2694,14 +2437,12 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" ); } -#endif // HAS_ARGBGRAYROW_NEON // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -#ifdef HAS_ARGBSEPIAROW_NEON void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { asm volatile ( "movi v20.8b, #17 \n" // BB coefficient @@ -2739,12 +2480,10 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" ); } -#endif // HAS_ARGBSEPIAROW_NEON // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -#ifdef HAS_ARGBCOLORMATRIXROW_NEON void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( @@ -2804,11 +2543,9 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "v18", "v19", "v22", "v23", "v24", "v25" ); } -#endif // HAS_ARGBCOLORMATRIXROW_NEON // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -#ifdef HAS_ARGBMULTIPLYROW_NEON void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2839,10 +2576,8 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } -#endif // HAS_ARGBMULTIPLYROW_NEON // Add 2 rows of ARGB pixels together, 8 pixels at a time. -#ifdef HAS_ARGBADDROW_NEON void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2869,10 +2604,8 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } -#endif // HAS_ARGBADDROW_NEON // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -#ifdef HAS_ARGBSUBTRACTROW_NEON void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2899,14 +2632,12 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } -#endif // HAS_ARGBSUBTRACTROW_NEON // Adds Sobel X and Sobel Y and stores Sobel into ARGB. // A = 255 // R = Sobel // G = Sobel // B = Sobel -#ifdef HAS_SOBELROW_NEON void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( @@ -2932,10 +2663,8 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, : "cc", "memory", "v0", "v1", "v2", "v3" ); } -#endif // HAS_SOBELROW_NEON // Adds Sobel X and Sobel Y and stores Sobel into plane. -#ifdef HAS_SOBELTOPLANEROW_NEON void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_y, int width) { asm volatile ( @@ -2958,14 +2687,12 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, : "cc", "memory", "v0", "v1" ); } -#endif // HAS_SOBELTOPLANEROW_NEON // Mixes Sobel X, Sobel Y and Sobel into ARGB. // A = 255 // R = Sobel X // G = Sobel // B = Sobel Y -#ifdef HAS_SOBELXYROW_NEON void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( @@ -2989,13 +2716,11 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, : "cc", "memory", "v0", "v1", "v2", "v3" ); } -#endif // HAS_SOBELXYROW_NEON // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -#ifdef HAS_SOBELXROW_NEON void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( @@ -3034,13 +2759,11 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } -#endif // HAS_SOBELXROW_NEON // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -#ifdef HAS_SOBELYROW_NEON void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { asm volatile ( @@ -3078,7 +2801,6 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } -#endif // HAS_SOBELYROW_NEON #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc index 71be268b4..cdb760603 100644 --- a/third_party/libyuv/source/row_win.cc +++ b/third_party/libyuv/source/row_win.cc @@ -21,183 +21,108 @@ namespace libyuv { extern "C" { #endif -// This module is for Visual C. -#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ - defined(_MSC_VER) && !defined(__clang__) +// This module is for Visual C 32/64 bit and clangcl 32 bit +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) -struct YuvConstants { - lvec8 kUVToB; // 0 - lvec8 kUVToG; // 32 - lvec8 kUVToR; // 64 - lvec16 kUVBiasB; // 96 - lvec16 kUVBiasG; // 128 - lvec16 kUVBiasR; // 160 - lvec16 kYToRgb; // 192 -}; +// 64 bit +#if defined(_M_X64) -// BT.601 YUV to RGB reference -// R = (Y - 16) * 1.164 - V * -1.596 -// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 -// B = (Y - 16) * 1.164 - U * -2.018 - -// Y contribution to R,G,B. Scale and bias. -// TODO(fbarchard): Consider moving constants into a common header. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -// U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.018 * 64)) */ -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ -#define VR -102 /* round(-1.596 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 + YGB) -#define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) - -// BT601 constants for YUV to RGB. -static YuvConstants SIMD_ALIGNED(kYuvConstants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; -// BT601 constants for NV21 where chroma plane is VU instead of UV. -static YuvConstants SIMD_ALIGNED(kYvuConstants) = { - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; -#undef YG -#undef YGB -#undef UB -#undef UG -#undef VG -#undef VR -#undef BB -#undef BG -#undef BR - -// JPEG YUV to RGB reference -// * R = Y - V * -1.40200 -// * G = Y - U * 0.34414 - V * 0.71414 -// * B = Y - U * -1.77200 - -// Y contribution to R,G,B. Scale and bias. -// TODO(fbarchard): Consider moving constants into a common header. -#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGBJ 32 /* 64 / 2 */ - -// U and V contributions to R,G,B. -#define UBJ -113 /* round(-1.77200 * 64) */ -#define UGJ 22 /* round(0.34414 * 64) */ -#define VGJ 46 /* round(0.71414 * 64) */ -#define VRJ -90 /* round(-1.40200 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. -#define BBJ (UBJ * 128 + YGBJ) -#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) -#define BRJ (VRJ * 128 + YGBJ) - -// JPEG constants for YUV to RGB. -static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { - { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, - UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, - { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, - UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, - UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, - UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, - { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, - 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, - { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, - BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, - { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, - BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, - { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, - BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, - { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, - YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } -}; +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(yuvconstants) \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm2 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ + xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ + xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ + xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm0 = _mm_adds_epi16(xmm0, xmm4); \ + xmm1 = _mm_adds_epi16(xmm1, xmm4); \ + xmm2 = _mm_adds_epi16(xmm2, xmm4); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); + +// Store 8 ARGB values. +#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ + dst_argb += 32; -#undef YGJ -#undef YGBJ -#undef UBJ -#undef UGJ -#undef VGJ -#undef VRJ -#undef BBJ -#undef BGJ -#undef BRJ -// 64 bit -#if defined(_M_X64) #if defined(HAS_I422TOARGBROW_SSSE3) void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm3; + __m128i xmm0, xmm1, xmm2, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; - while (width > 0) { - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); - xmm1 = _mm_loadu_si128(&xmm0); - xmm2 = _mm_loadu_si128(&xmm0); - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); - xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); - xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); - xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); - xmm3 = _mm_loadl_epi64((__m128i*)y_buf); - xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); - xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); - xmm0 = _mm_adds_epi16(xmm0, xmm3); - xmm1 = _mm_adds_epi16(xmm1, xmm3); - xmm2 = _mm_adds_epi16(xmm2, xmm3); - xmm0 = _mm_srai_epi16(xmm0, 6); - xmm1 = _mm_srai_epi16(xmm1, 6); - xmm2 = _mm_srai_epi16(xmm2, 6); - xmm0 = _mm_packus_epi16(xmm0, xmm0); - xmm1 = _mm_packus_epi16(xmm1, xmm1); - xmm2 = _mm_packus_epi16(xmm2, xmm2); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - xmm1 = _mm_loadu_si128(&xmm0); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); - - _mm_storeu_si128((__m128i *)dst_argb, xmm0); - _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); + READYUV422 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif - y_buf += 8; - u_buf += 4; - dst_argb += 32; +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + while (width > 0) { + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB width -= 8; } } #endif + // 32 bit #else // defined(_M_X64) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -301,6 +226,24 @@ static const uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting RAW to RGB24. Middle 8. +static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + // Shuffle table for converting ARGB to RGB24. static const uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u @@ -316,18 +259,43 @@ static const uvec8 kShuffleMaskARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u }; -// Shuffle table for converting ARGB to RAW. -static const uvec8 kShuffleMaskARGBToRAW_0 = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 +}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = { + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 +}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = { + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 +}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = { + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 +}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; // Duplicates gray value 3 times and fills in alpha opaque. __declspec(naked) -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_y mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 @@ -352,11 +320,11 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. __declspec(naked) -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_y mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 @@ -382,14 +350,14 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { #endif // HAS_J400TOARGBROW_AVX2 __declspec(naked) -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_rgb24 mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - movdqa xmm4, kShuffleMaskRGB24ToARGB + movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB convertloop: movdqu xmm0, [eax] @@ -421,14 +389,14 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { __declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int pix) { + int width) { __asm { mov eax, [esp + 4] // src_raw mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - movdqa xmm4, kShuffleMaskRAWToARGB + movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB convertloop: movdqu xmm0, [eax] @@ -458,6 +426,34 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, } } +__declspec(naked) +void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 + mov ecx, [esp + 12] // width + movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 + movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 + movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 4] + movdqu xmm2, [eax + 8] + lea eax, [eax + 24] + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + ret + } +} + // pmul method to replicate bits. // Math to replicate bits: // (v << 8) | (v << 3) @@ -467,7 +463,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, // 20 instructions. __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int pix) { + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -485,7 +481,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, mov eax, [esp + 4] // src_rgb565 mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width sub edx, eax sub edx, eax @@ -523,13 +519,13 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int pix) { + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - movd xmm6, eax + vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 @@ -541,7 +537,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, mov eax, [esp + 4] // src_rgb565 mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width sub edx, eax sub edx, eax @@ -574,13 +570,13 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, #ifdef HAS_ARGB1555TOARGBROW_AVX2 __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int pix) { + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - movd xmm6, eax + vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 @@ -590,7 +586,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, mov eax, [esp + 4] // src_argb1555 mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width sub edx, eax sub edx, eax @@ -626,7 +622,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, #ifdef HAS_ARGB4444TOARGBROW_AVX2 __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int pix) { + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f vmovd xmm4, eax @@ -634,7 +630,7 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles mov eax, [esp + 4] // src_argb4444 mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width sub edx, eax sub edx, eax @@ -664,7 +660,7 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, // 24 instructions __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int pix) { + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -681,7 +677,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, mov eax, [esp + 4] // src_argb1555 mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width sub edx, eax sub edx, eax @@ -717,7 +713,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, // 18 instructions. __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int pix) { + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f movd xmm4, eax @@ -726,7 +722,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, pslld xmm5, 4 mov eax, [esp + 4] // src_argb4444 mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width sub edx, eax sub edx, eax @@ -754,12 +750,12 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, } __declspec(naked) -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix - movdqa xmm6, kShuffleMaskARGBToRGB24 + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 convertloop: movdqu xmm0, [eax] // fetch 16 pixels of argb @@ -792,12 +788,12 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } __declspec(naked) -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix - movdqa xmm6, kShuffleMaskARGBToRAW + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW convertloop: movdqu xmm0, [eax] // fetch 16 pixels of argb @@ -829,13 +825,12 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -// 4 pixels __declspec(naked) -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 @@ -867,16 +862,15 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -// 8 pixels __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix) { + const uint32 dither4, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb movd xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // pix + mov ecx, [esp + 16] // width punpcklbw xmm6, xmm6 // make dither 16 bytes movdqa xmm7, xmm6 punpcklwd xmm6, xmm6 @@ -916,12 +910,12 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int pix) { + const uint32 dither4, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // pix + mov ecx, [esp + 16] // width vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes vpermq ymm6, ymm6, 0xd8 vpunpcklwd ymm6, ymm6, ymm6 @@ -958,11 +952,11 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, // TODO(fbarchard): Improve sign extension/packing. __declspec(naked) -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width pcmpeqb xmm4, xmm4 // generate mask 0x0000001f psrld xmm4, 27 movdqa xmm5, xmm4 // generate mask 0x000003e0 @@ -999,11 +993,11 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } __declspec(naked) -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 psllw xmm4, 12 movdqa xmm3, xmm4 // generate mask 0x00f000f0 @@ -1029,11 +1023,11 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { #ifdef HAS_ARGBTORGB565ROW_AVX2 __declspec(naked) -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 @@ -1066,11 +1060,11 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #ifdef HAS_ARGBTOARGB1555ROW_AVX2 __declspec(naked) -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width vpcmpeqb ymm4, ymm4, ymm4 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 @@ -1106,11 +1100,11 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #ifdef HAS_ARGBTOARGB4444ROW_AVX2 __declspec(naked) -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 vpsllw ymm4, ymm4, 12 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 @@ -1137,13 +1131,13 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { // Convert 16 ARGB pixels (64 bytes) to 16 Y values. __declspec(naked) -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kARGBToY - movdqa xmm5, kAddY16 + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToY + movdqa xmm5, xmmword ptr kAddY16 convertloop: movdqu xmm0, [eax] @@ -1172,13 +1166,13 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. __declspec(naked) -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kARGBToYJ - movdqa xmm5, kAddYJ64 + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 convertloop: movdqu xmm0, [eax] @@ -1213,14 +1207,14 @@ static const lvec32 kPermdARGBToY_AVX = { // Convert 32 ARGB pixels (128 bytes) to 32 Y values. __declspec(naked) -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - vbroadcastf128 ymm4, kARGBToY - vbroadcastf128 ymm5, kAddY16 - vmovdqu ymm6, kPermdARGBToY_AVX + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToY + vbroadcastf128 ymm5, xmmword ptr kAddY16 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX convertloop: vmovdqu ymm0, [eax] @@ -1252,14 +1246,14 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. __declspec(naked) -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - vbroadcastf128 ymm4, kARGBToYJ - vbroadcastf128 ymm5, kAddYJ64 - vmovdqu ymm6, kPermdARGBToY_AVX + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToYJ + vbroadcastf128 ymm5, xmmword ptr kAddYJ64 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX convertloop: vmovdqu ymm0, [eax] @@ -1291,13 +1285,13 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { #endif // HAS_ARGBTOYJROW_AVX2 __declspec(naked) -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kBGRAToY - movdqa xmm5, kAddY16 + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kBGRAToY + movdqa xmm5, xmmword ptr kAddY16 convertloop: movdqu xmm0, [eax] @@ -1324,13 +1318,13 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } __declspec(naked) -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kABGRToY - movdqa xmm5, kAddY16 + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kABGRToY + movdqa xmm5, xmmword ptr kAddY16 convertloop: movdqu xmm0, [eax] @@ -1357,13 +1351,13 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } __declspec(naked) -void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kRGBAToY - movdqa xmm5, kAddY16 + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kRGBAToY + movdqa xmm5, xmmword ptr kAddY16 convertloop: movdqu xmm0, [eax] @@ -1399,10 +1393,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm5, kAddUV128 - movdqa xmm6, kARGBToV - movdqa xmm7, kARGBToU + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v convertloop: @@ -1469,10 +1463,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm5, kAddUVJ128 - movdqa xmm6, kARGBToVJ - movdqa xmm7, kARGBToUJ + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUVJ128 + movdqa xmm6, xmmword ptr kARGBToVJ + movdqa xmm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v convertloop: @@ -1511,7 +1505,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, pmaddubsw xmm3, xmm6 phaddw xmm0, xmm2 phaddw xmm1, xmm3 - paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm0, xmm5 // +.5 rounding -> unsigned paddw xmm1, xmm5 psraw xmm0, 8 psraw xmm1, 8 @@ -1541,10 +1535,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - vbroadcastf128 ymm5, kAddUV128 - vbroadcastf128 ymm6, kARGBToV - vbroadcastf128 ymm7, kARGBToU + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToV + vbroadcastf128 ymm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v convertloop: @@ -1578,7 +1572,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpsraw ymm0, ymm0, 8 vpacksswb ymm0, ymm1, ymm0 // mutates vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw vpaddb ymm0, ymm0, ymm5 // -> unsigned // step 3 - store 16 U and 16 V values @@ -1596,6 +1590,73 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVROW_AVX2 +#ifdef HAS_ARGBTOUVJROW_AVX2 +__declspec(naked) +void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToV + vbroadcastf128 ymm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned + vpaddw ymm0, ymm0, ymm5 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVJROW_AVX2 + __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { @@ -1604,10 +1665,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, mov eax, [esp + 4 + 4] // src_argb mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - movdqa xmm5, kAddUV128 - movdqa xmm6, kARGBToV - movdqa xmm7, kARGBToU + mov ecx, [esp + 4 + 16] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v convertloop: @@ -1654,64 +1715,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } __declspec(naked) -void ARGBToUV422Row_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - movdqa xmm5, kAddUV128 - movdqa xmm6, kARGBToV - movdqa xmm7, kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1721,10 +1724,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm5, kAddUV128 - movdqa xmm6, kBGRAToV - movdqa xmm7, kBGRAToU + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kBGRAToV + movdqa xmm7, xmmword ptr kBGRAToU sub edi, edx // stride from u to v convertloop: @@ -1791,10 +1794,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm5, kAddUV128 - movdqa xmm6, kABGRToV - movdqa xmm7, kABGRToU + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kABGRToV + movdqa xmm7, xmmword ptr kABGRToU sub edi, edx // stride from u to v convertloop: @@ -1861,10 +1864,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm5, kAddUV128 - movdqa xmm6, kRGBAToV - movdqa xmm7, kRGBAToU + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kRGBAToV + movdqa xmm7, xmmword ptr kRGBAToU sub edi, edx // stride from u to v convertloop: @@ -1924,33 +1927,62 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, // Read 16 UV from 444 #define READYUV444_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ + __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ } // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + } + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vpermq ymm5, ymm5, 0xd8 \ + __asm lea ebp, [ebp + 16] \ } // Read 4 UV from 411, upsample to 16 UV. #define READYUV411_AVX2 __asm { \ - __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ - __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm vmovd xmm0, dword ptr [esi] /* U */ \ + __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ } // Read 8 UV from NV12, upsample to 16 UV. @@ -1959,29 +1991,58 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + } + +// Read 8 UV from NV21, upsample to 16 UV. +#define READNV21_AVX2 __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + } + +// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. +#define READYUY2_AVX2 __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ + __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 32] \ + } + +// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. +#define READUYVY_AVX2 __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ + __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 32] \ } // Convert 16 pixels: 16 UV and 16 Y. #define YUVTORGB_AVX2(YuvConstants) __asm { \ - /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \ - __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \ - __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \ - __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \ - __asm vmovdqu ymm3, YuvConstants.kUVBiasR \ + __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ + __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ + __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ __asm vpsubw ymm2, ymm3, ymm2 \ - __asm vmovdqu ymm3, YuvConstants.kUVBiasG \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ __asm vpsubw ymm1, ymm3, ymm1 \ - __asm vmovdqu ymm3, YuvConstants.kUVBiasB \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ __asm vpsubw ymm0, ymm3, ymm0 \ /* Step 2: Find Y contribution to 16 R,G,B values */ \ - __asm vmovdqu xmm3, [eax] /* NOLINT */ \ - __asm lea eax, [eax + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm3 \ - __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \ - __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ - __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ - __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ + __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ + __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ @@ -1992,7 +2053,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, // Store 16 ARGB values. #define STOREARGB_AVX2 __asm { \ - /* Step 3: Weave into ARGB */ \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ @@ -2004,6 +2064,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm lea edx, [edx + 64] \ } +// Store 16 RGBA values. +#define STORERGBA_AVX2 __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vmovdqu [edx], ymm0 \ + __asm vmovdqu [edx + 32], ymm1 \ + __asm lea edx, [edx + 64] \ + } + #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). @@ -2012,26 +2085,30 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) + YUVTORGB_AVX2(ebx) STOREARGB_AVX2 sub ecx, 16 jg convertloop + pop ebx pop edi pop esi vzeroupper @@ -2040,41 +2117,48 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 -#ifdef HAS_J422TOARGBROW_AVX2 +#ifdef HAS_I422ALPHATOARGBROW_AVX2 // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void J422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +__declspec(naked) +void I422AlphaToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(kYuvJConstants) + READYUVA422_AVX2 + YUVTORGB_AVX2(ebx) STOREARGB_AVX2 sub ecx, 16 jg convertloop + pop ebp + pop ebx pop edi pop esi vzeroupper ret } } -#endif // HAS_J422TOARGBROW_AVX2 +#endif // HAS_I422ALPHATOARGBROW_AVX2 #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels @@ -2084,26 +2168,29 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - convertloop: READYUV444_AVX2 - YUVTORGB_AVX2(kYuvConstants) + YUVTORGB_AVX2(ebx) STOREARGB_AVX2 sub ecx, 16 jg convertloop + pop ebx pop edi pop esi vzeroupper @@ -2120,26 +2207,30 @@ void I411ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // abgr + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV411_AVX2 - YUVTORGB_AVX2(kYuvConstants) + YUVTORGB_AVX2(ebx) STOREARGB_AVX2 sub ecx, 16 jg convertloop + pop ebx pop edi pop esi vzeroupper @@ -2155,23 +2246,27 @@ __declspec(naked) void NV12ToARGBRow_AVX2(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // UV - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV12_AVX2 - YUVTORGB_AVX2(kYuvConstants) + YUVTORGB_AVX2(ebx) STOREARGB_AVX2 sub ecx, 16 jg convertloop + pop ebx pop esi vzeroupper ret @@ -2181,28 +2276,32 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV21TOARGBROW_AVX2 // 16 pixels. -// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) void NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, + const uint8* vu_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // UV - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: - READNV12_AVX2 - YUVTORGB_AVX2(kYvuConstants) + READNV21_AVX2 + YUVTORGB_AVX2(ebx) STOREARGB_AVX2 sub ecx, 16 jg convertloop + pop ebx pop esi vzeroupper ret @@ -2210,153 +2309,121 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_NV21TOARGBROW_AVX2 -#ifdef HAS_I422TOBGRAROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). -// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +#ifdef HAS_YUY2TOARGBROW_AVX2 +// 16 pixels. +// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). __declspec(naked) -void I422ToBGRARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width - sub edi, esi + push ebx + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) + READYUY2_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 - // Step 3: Weave into BGRA - vpunpcklbw ymm1, ymm1, ymm0 // GB - vpermq ymm1, ymm1, 0xd8 - vpunpcklbw ymm2, ymm5, ymm2 // AR - vpermq ymm2, ymm2, 0xd8 - vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels - vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] sub ecx, 16 jg convertloop - pop edi - pop esi + pop ebx vzeroupper ret } } -#endif // HAS_I422TOBGRAROW_AVX2 +#endif // HAS_YUY2TOARGBROW_AVX2 -#ifdef HAS_I422TORGBAROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +#ifdef HAS_UYVYTOARGBROW_AVX2 +// 16 pixels. +// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). __declspec(naked) -void I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, +void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width - sub edi, esi + push ebx + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) + READUYVY_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 - // Step 3: Weave into RGBA - vpunpcklbw ymm1, ymm1, ymm2 // GR - vpermq ymm1, ymm1, 0xd8 - vpunpcklbw ymm2, ymm5, ymm0 // AB - vpermq ymm2, ymm2, 0xd8 - vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels - vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] sub ecx, 16 jg convertloop - pop edi - pop esi + pop ebx vzeroupper ret } } -#endif // HAS_I422TORGBAROW_AVX2 +#endif // HAS_UYVYTOARGBROW_AVX2 -#ifdef HAS_I422TOABGRROW_AVX2 +#ifdef HAS_I422TORGBAROW_AVX2 // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). -// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). __declspec(naked) -void I422ToABGRRow_AVX2(const uint8* y_buf, +void I422ToRGBARow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // abgr + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 - YUVTORGB_AVX2(kYuvConstants) + YUVTORGB_AVX2(ebx) + STORERGBA_AVX2 - // Step 3: Weave into ABGR - vpunpcklbw ymm1, ymm2, ymm1 // RG - vpermq ymm1, ymm1, 0xd8 - vpunpcklbw ymm2, ymm0, ymm5 // BA - vpermq ymm2, ymm2, 0xd8 - vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels - vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] sub ecx, 16 jg convertloop + pop ebx pop edi pop esi vzeroupper ret } } -#endif // HAS_I422TOABGRROW_AVX2 +#endif // HAS_I422TORGBAROW_AVX2 #if defined(HAS_I422TOARGBROW_SSSE3) // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. +// Allows a conversion with half size scaling. // Read 8 UV from 444. #define READYUV444 __asm { \ - __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ } // Read 4 UV from 422, upsample to 8 UV. @@ -2366,50 +2433,99 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm lea esi, [esi + 4] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + } + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8] \ } // Read 2 UV from 411, upsample to 8 UV. -#define READYUV411 __asm { \ - __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ +// drmemory fails with memory fault if pinsrw used. libyuv bug: 525 +// __asm pinsrw xmm0, [esi], 0 /* U */ +// __asm pinsrw xmm1, [esi + edi], 0 /* V */ +#define READYUV411_EBX __asm { \ + __asm movzx ebx, word ptr [esi] /* U */ \ __asm movd xmm0, ebx \ - __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movzx ebx, word ptr [esi + edi] /* V */ \ __asm movd xmm1, ebx \ __asm lea esi, [esi + 2] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ } // Read 4 UV from NV12, upsample to 8 UV. #define READNV12 __asm { \ - __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ + __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + } + +// Read 4 VU from NV21, upsample to 8 UV. +#define READNV21 __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + } + +// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. +#define READYUY2 __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 16] \ + } + +// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. +#define READUYVY __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 16] \ } // Convert 8 pixels: 8 UV and 8 Y. #define YUVTORGB(YuvConstants) __asm { \ - /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ __asm movdqa xmm3, xmm0 \ - __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \ - __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \ + __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ + __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ __asm psubw xmm0, xmm1 \ - __asm movdqa xmm1, YuvConstants.kUVBiasG \ - __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \ + __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ + __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ __asm psubw xmm1, xmm2 \ - __asm movdqa xmm2, YuvConstants.kUVBiasR \ - __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \ + __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ + __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm psubw xmm2, xmm3 \ - /* Step 2: Find Y contribution to 8 R,G,B values */ \ - __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ - __asm lea eax, [eax + 8] \ - __asm punpcklbw xmm3, xmm3 \ - __asm pmulhuw xmm3, YuvConstants.kYToRgb \ - __asm paddsw xmm0, xmm3 /* B += Y */ \ - __asm paddsw xmm1, xmm3 /* G += Y */ \ - __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ + __asm paddsw xmm0, xmm4 /* B += Y */ \ + __asm paddsw xmm1, xmm4 /* G += Y */ \ + __asm paddsw xmm2, xmm4 /* R += Y */ \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ @@ -2420,7 +2536,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, // Store 8 ARGB values. #define STOREARGB __asm { \ - /* Step 3: Weave into ARGB */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ @@ -2433,7 +2548,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, // Store 8 BGRA values. #define STOREBGRA __asm { \ - /* Step 3: Weave into BGRA */ \ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ __asm punpcklbw xmm1, xmm0 /* GB */ \ __asm punpcklbw xmm5, xmm2 /* AR */ \ @@ -2445,22 +2559,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm lea edx, [edx + 32] \ } -// Store 8 ABGR values. -#define STOREABGR __asm { \ - /* Step 3: Weave into ABGR */ \ - __asm punpcklbw xmm2, xmm1 /* RG */ \ - __asm punpcklbw xmm0, xmm5 /* BA */ \ - __asm movdqa xmm1, xmm2 \ - __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ - __asm movdqu 0[edx], xmm2 \ - __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32] \ - } - // Store 8 RGBA values. #define STORERGBA __asm { \ - /* Step 3: Weave into RGBA */ \ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ __asm punpcklbw xmm1, xmm2 /* GR */ \ __asm punpcklbw xmm5, xmm0 /* AB */ \ @@ -2474,30 +2574,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, // Store 8 RGB24 values. #define STORERGB24 __asm { \ - /* Step 3: Weave into RRGB */ \ + /* Weave into RRGB */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* Step 4: RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24] \ - } - -// Store 8 RAW values. -#define STORERAW __asm { \ - /* Step 3: Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* Step 4: RRGB -> RAW */ \ + /* RRGB -> RGB24 */ \ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ @@ -2508,13 +2591,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, // Store 8 RGB565 values. #define STORERGB565 __asm { \ - /* Step 3: Weave into RRGB */ \ + /* Weave into RRGB */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* Step 4: RRGB -> RGB565 */ \ + /* RRGB -> RGB565 */ \ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ __asm movdqa xmm2, xmm0 /* G */ \ __asm pslld xmm0, 8 /* R */ \ @@ -2549,26 +2632,30 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV444 - YUVTORGB(kYuvConstants) + YUVTORGB(ebx) STOREARGB sub ecx, 8 jg convertloop + pop ebx pop edi pop esi ret @@ -2582,61 +2669,31 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // rgb24 - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi - movdqa xmm5, kShuffleMaskARGBToRGB24_0 - movdqa xmm6, kShuffleMaskARGBToRGB24 + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 convertloop: READYUV422 - YUVTORGB(kYuvConstants) + YUVTORGB(ebx) STORERGB24 sub ecx, 8 jg convertloop - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). -__declspec(naked) -void I422ToRAWRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_raw, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // raw - mov ecx, [esp + 8 + 20] // width - sub edi, esi - movdqa xmm5, kShuffleMaskARGBToRAW_0 - movdqa xmm6, kShuffleMaskARGBToRAW - - convertloop: - READYUV422 - YUVTORGB(kYuvConstants) - STORERAW - - sub ecx, 8 - jg convertloop - + pop ebx pop edi pop esi ret @@ -2650,15 +2707,18 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb565_buf, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // rgb565 - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate mask 0x0000001f psrld xmm5, 27 @@ -2670,12 +2730,13 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, convertloop: READYUV422 - YUVTORGB(kYuvConstants) + YUVTORGB(ebx) STORERGB565 sub ecx, 8 jg convertloop + pop ebx pop edi pop esi ret @@ -2689,26 +2750,30 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV422 - YUVTORGB(kYuvConstants) + YUVTORGB(ebx) STOREARGB sub ecx, 8 jg convertloop + pop ebx pop edi pop esi ret @@ -2716,33 +2781,39 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, } // 8 pixels. -// JPeg color space version of I422ToARGB -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. __declspec(naked) -void J422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { +void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: - READYUV422 - YUVTORGB(kYuvJConstants) + READYUVA422 + YUVTORGB(ebx) STOREARGB sub ecx, 8 jg convertloop + pop ebp + pop ebx pop edi pop esi ret @@ -2757,30 +2828,34 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { - push ebx push esi push edi - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ecx, [esp + 12 + 20] // width + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov edx, [esp + 16 + 16] // abgr + mov ebp, [esp + 16 + 20] // yuvconstants + mov ecx, [esp + 16 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: - READYUV411 // modifies EBX - YUVTORGB(kYuvConstants) + READYUV411_EBX + YUVTORGB(ebp) STOREARGB sub ecx, 8 jg convertloop + pop ebp + pop ebx pop edi pop esi - pop ebx ret } } @@ -2791,113 +2866,116 @@ __declspec(naked) void NV12ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // UV - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV12 - YUVTORGB(kYuvConstants) + YUVTORGB(ebx) STOREARGB sub ecx, 8 jg convertloop + pop ebx pop esi ret } } // 8 pixels. -// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) void NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, + const uint8* vu_buf, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // UV - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: - READNV12 - YUVTORGB(kYvuConstants) + READNV21 + YUVTORGB(ebx) STOREARGB sub ecx, 8 jg convertloop + pop ebx pop esi ret } } +// 8 pixels. +// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). __declspec(naked) -void I422ToBGRARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_bgra, +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // bgra - mov ecx, [esp + 8 + 20] // width - sub edi, esi + push ebx + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: - READYUV422 - YUVTORGB(kYuvConstants) - STOREBGRA + READYUY2 + YUVTORGB(ebx) + STOREARGB sub ecx, 8 jg convertloop - pop edi - pop esi + pop ebx ret } } +// 8 pixels. +// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). __declspec(naked) -void I422ToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_abgr, +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) { __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // abgr - mov ecx, [esp + 8 + 20] // width - sub edi, esi + push ebx + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: - READYUV422 - YUVTORGB(kYuvConstants) - STOREABGR + READUYVY + YUVTORGB(ebx) + STOREARGB sub ecx, 8 jg convertloop - pop edi - pop esi + pop ebx ret } } @@ -2907,31 +2985,34 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_rgba, + const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // rgba - mov ecx, [esp + 8 + 20] // width + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width sub edi, esi convertloop: READYUV422 - YUVTORGB(kYuvConstants) + YUVTORGB(ebx) STORERGBA sub ecx, 8 jg convertloop + pop ebx pop edi pop esi ret } } - #endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I400TOARGBROW_SSE2 @@ -3045,7 +3126,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - movdqa xmm5, kShuffleMirror + movdqa xmm5, xmmword ptr kShuffleMirror convertloop: movdqu xmm0, [eax - 16 + ecx] @@ -3066,7 +3147,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - vbroadcastf128 ymm5, kShuffleMirror + vbroadcastf128 ymm5, xmmword ptr kShuffleMirror convertloop: vmovdqu ymm0, [eax - 32 + ecx] @@ -3082,33 +3163,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORROW_SSE2 -__declspec(naked) -void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - - convertloop: - movdqu xmm0, [eax - 16 + ecx] - movdqa xmm1, xmm0 // swap bytes - psllw xmm0, 8 - psrlw xmm1, 8 - por xmm0, xmm1 - pshuflw xmm0, xmm0, 0x1b // swap words - pshufhw xmm0, xmm0, 0x1b - pshufd xmm0, xmm0, 0x4e // swap qwords - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} -#endif // HAS_MIRRORROW_SSE2 - -#ifdef HAS_MIRRORROW_UV_SSSE3 +#ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. static const uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u @@ -3123,7 +3178,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width - movdqa xmm1, kShuffleMirrorUV + movdqa xmm1, xmmword ptr kShuffleMirrorUV lea eax, [eax + ecx * 2 - 16] sub edi, edx @@ -3141,7 +3196,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, ret } } -#endif // HAS_MIRRORROW_UV_SSSE3 +#endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 __declspec(naked) @@ -3177,7 +3232,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - vmovdqu ymm5, kARGBShuffleMirror_AVX2 + vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 convertloop: vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order @@ -3193,13 +3248,14 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_SPLITUVROW_SSE2 __declspec(naked) -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { __asm { push edi mov eax, [esp + 4 + 4] // src_uv mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix + mov ecx, [esp + 4 + 16] // width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3231,13 +3287,14 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #ifdef HAS_SPLITUVROW_AVX2 __declspec(naked) -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { __asm { push edi mov eax, [esp + 4 + 4] // src_uv mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix + mov ecx, [esp + 4 + 16] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3339,8 +3396,23 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count + test eax, 15 + jne convertloopu + test edx, 15 + jne convertloopu - convertloop: + convertloopa: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloopa + ret + + convertloopu: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] @@ -3348,7 +3420,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 32 - jg convertloop + jg convertloopu ret } } @@ -3460,6 +3532,33 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { } #endif // HAS_ARGBCOPYALPHAROW_AVX2 +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +// width in pixels +__declspec(naked) +void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + + extractloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrld xmm0, 24 + psrld xmm1, 24 + packssdw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg extractloop + + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels __declspec(naked) @@ -3579,12 +3678,11 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #ifdef HAS_YUY2TOYROW_AVX2 __declspec(naked) -void YUY2ToYRow_AVX2(const uint8* src_yuy2, - uint8* dst_y, int pix) { +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { __asm { mov eax, [esp + 4] // src_yuy2 mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 @@ -3607,7 +3705,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push esi push edi @@ -3615,7 +3713,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, mov esi, [esp + 8 + 8] // stride_yuy2 mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix + mov ecx, [esp + 8 + 20] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3651,13 +3749,13 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push edi mov eax, [esp + 4 + 4] // src_yuy2 mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix + mov ecx, [esp + 4 + 16] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3690,11 +3788,11 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy, - uint8* dst_y, int pix) { + uint8* dst_y, int width) { __asm { mov eax, [esp + 4] // src_uyvy mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] @@ -3715,7 +3813,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push esi push edi @@ -3723,7 +3821,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, mov esi, [esp + 8 + 8] // stride_yuy2 mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix + mov ecx, [esp + 8 + 20] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3759,13 +3857,13 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push edi mov eax, [esp + 4 + 4] // src_yuy2 mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix + mov ecx, [esp + 4 + 16] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3800,11 +3898,11 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #ifdef HAS_YUY2TOYROW_SSE2 __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { + uint8* dst_y, int width) { __asm { mov eax, [esp + 4] // src_yuy2 mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 @@ -3825,7 +3923,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push esi push edi @@ -3833,7 +3931,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, mov esi, [esp + 8 + 8] // stride_yuy2 mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix + mov ecx, [esp + 8 + 20] // width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3868,13 +3966,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push edi mov eax, [esp + 4 + 4] // src_yuy2 mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix + mov ecx, [esp + 4 + 16] // width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3904,11 +4002,11 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { + uint8* dst_y, int width) { __asm { mov eax, [esp + 4] // src_uyvy mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix + mov ecx, [esp + 12] // width convertloop: movdqu xmm0, [eax] @@ -3927,7 +4025,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push esi push edi @@ -3935,7 +4033,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, mov esi, [esp + 8 + 8] // stride_yuy2 mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix + mov ecx, [esp + 8 + 20] // width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3970,13 +4068,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { + uint8* dst_u, uint8* dst_v, int width) { __asm { push edi mov eax, [esp + 4 + 4] // src_yuy2 mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix + mov ecx, [esp + 4 + 16] // width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4005,92 +4103,122 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, } #endif // HAS_YUY2TOYROW_SSE2 -#ifdef HAS_ARGBBLENDROW_SSE2 +#ifdef HAS_BLENDPLANEROW_SSSE3 // Blend 8 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 __declspec(naked) -void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 + push edi pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - sub ecx, 4 - jl convertloop4b // less than 4 pixels? - - // 4 pixel loop. - convertloop4: - movdqu xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3, 0F5h // 8 alpha words - pshuflw xmm3, xmm3, 0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop4b: - add ecx, 4 - 1 - jl convertloop1b + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm6, eax + pshufd xmm6, xmm6, 0x00 + + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + movd xmm7, eax + pshufd xmm7, xmm7, 0x00 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi - // 1 pixel loop. - convertloop1: - movd xmm3, [eax] // src argb - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3, 0F5h // 8 alpha words - pshuflw xmm3, xmm3, 0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 + // 8 pixel loop. + convertloop8: + movq xmm0, qword ptr [esi] // alpha + punpcklbw xmm0, xmm0 + pxor xmm0, xmm5 // a, 255-a + movq xmm1, qword ptr [eax + esi] // src0 + movq xmm2, qword ptr [edx + esi] // src1 + punpcklbw xmm1, xmm2 + psubb xmm1, xmm6 // bias src0/1 - 128 + pmaddubsw xmm0, xmm1 + paddw xmm0, xmm7 // unbias result - 32768 and round. + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edi + esi], xmm0 + lea esi, [esi + 8] + sub ecx, 8 + jg convertloop8 - convertloop1b: + pop edi pop esi ret } } -#endif // HAS_ARGBBLENDROW_SSE2 +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +__declspec(naked) +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { + __asm { + push esi + push edi + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpsllw ymm5, ymm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + vmovd xmm7, eax + vbroadcastss ymm7, xmm7 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 32 pixel loop. + convertloop32: + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm1, [eax + esi] // src0 + vmovdqu ymm2, [edx + esi] // src1 + vpunpckhbw ymm4, ymm1, ymm2 + vpunpcklbw ymm1, ymm1, ymm2 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpmaddubsw ymm3, ymm3, ymm4 + vpmaddubsw ymm0, ymm0, ymm1 + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpsrlw ymm3, ymm3, 8 + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm3 + vmovdqu [edi + esi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg convertloop32 + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. @@ -4098,14 +4226,8 @@ static const uvec8 kShuffleAlpha = { 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 }; -// Same as SSE2, but replaces: -// psrlw xmm3, 8 // alpha -// pshufhw xmm3, xmm3, 0F5h // 8 alpha words -// pshuflw xmm3, xmm3, 0F5h -// with.. -// pshufb xmm3, kShuffleAlpha // alpha -// Blend 8 pixels at a time. +// Blend 8 pixels at a time. __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { @@ -4133,7 +4255,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha movdqu xmm2, [esi] // _r_b - pshufb xmm3, kShuffleAlpha // alpha + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha @@ -4162,7 +4284,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha movd xmm2, [esi] // _r_b - pshufb xmm3, kShuffleAlpha // alpha + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha @@ -4187,48 +4309,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATEROW_SSE2 -// Attenuate 4 pixels at a time. -__declspec(naked) -void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff - psrld xmm5, 8 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm0 // first 2 - pshufhw xmm2, xmm0, 0FFh // 8 alpha words - pshuflw xmm2, xmm2, 0FFh - pmulhuw xmm0, xmm2 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm1 // next 2 pixels - pshufhw xmm2, xmm1, 0FFh // 8 alpha words - pshuflw xmm2, xmm2, 0FFh - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // alphas - lea eax, [eax + 16] - psrlw xmm0, 8 - pand xmm2, xmm4 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - pand xmm0, xmm5 // keep original alphas - por xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBATTENUATEROW_SSE2 - #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. static const uvec8 kShuffleAlpha0 = { @@ -4246,8 +4326,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov ecx, [esp + 12] // width pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pslld xmm3, 24 - movdqa xmm4, kShuffleAlpha0 - movdqa xmm5, kShuffleAlpha1 + movdqa xmm4, xmmword ptr kShuffleAlpha0 + movdqa xmm5, xmmword ptr kShuffleAlpha1 convertloop: movdqu xmm0, [eax] // read 4 pixels @@ -4289,7 +4369,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax - vbroadcastf128 ymm4,kShuffleAlpha_AVX2 + vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 @@ -4323,19 +4403,21 @@ __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { + push ebx push esi push edi - mov eax, [esp + 8 + 4] // src_argb0 - mov edx, [esp + 8 + 8] // dst_argb - mov ecx, [esp + 8 + 12] // width + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + lea ebx, fixed_invtbl8 convertloop: movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha movzx edi, byte ptr [eax + 7] // second alpha punpcklbw xmm0, xmm0 // first 2 - movd xmm2, dword ptr fixed_invtbl8[esi * 4] - movd xmm3, dword ptr fixed_invtbl8[edi * 4] + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 @@ -4345,21 +4427,22 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, movzx esi, byte ptr [eax + 11] // third alpha movzx edi, byte ptr [eax + 15] // forth alpha punpckhbw xmm1, xmm1 // next 2 - movd xmm2, dword ptr fixed_invtbl8[esi * 4] - movd xmm3, dword ptr fixed_invtbl8[edi * 4] + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 pmulhuw xmm1, xmm2 // rgb * a lea eax, [eax + 16] - packuswb xmm0, xmm1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 jg convertloop + pop edi pop esi + pop ebx ret } } @@ -4381,7 +4464,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax - vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 + vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: vmovdqu ymm6, [eax] // read 8 pixels. @@ -4412,36 +4495,37 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 - + push ebx push esi push edi + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + sub edx, eax + lea ebx, fixed_invtbl8 + vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: // replace VPGATHER movzx esi, byte ptr [eax + 3] // alpha0 movzx edi, byte ptr [eax + 7] // alpha1 - vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] - vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] movzx esi, byte ptr [eax + 11] // alpha2 movzx edi, byte ptr [eax + 15] // alpha3 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] - vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] - vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] movzx esi, byte ptr [eax + 19] // alpha4 movzx edi, byte ptr [eax + 23] // alpha5 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] - vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] - vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] movzx esi, byte ptr [eax + 27] // alpha6 movzx edi, byte ptr [eax + 31] // alpha7 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] - vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] - vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] @@ -4465,6 +4549,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, pop edi pop esi + pop ebx vzeroupper ret } @@ -4480,8 +4565,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 12] /* width */ - movdqa xmm4, kARGBToYJ - movdqa xmm5, kAddYJ64 + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 convertloop: movdqu xmm0, [eax] // G @@ -4538,9 +4623,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] /* dst_argb */ mov ecx, [esp + 8] /* width */ - movdqa xmm2, kARGBToSepiaB - movdqa xmm3, kARGBToSepiaG - movdqa xmm4, kARGBToSepiaR + movdqa xmm2, xmmword ptr kARGBToSepiaB + movdqa xmm3, xmmword ptr kARGBToSepiaG + movdqa xmm4, xmmword ptr kARGBToSepiaR convertloop: movdqu xmm0, [eax] // B @@ -5190,6 +5275,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // dst points to pixel to store result to. // count is number of averaged pixels to produce. // Does 4 pixels at a time. +// This function requires alignment on accumulation buffer pointers. void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count) { @@ -5517,36 +5603,38 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - shr eax, 1 // Dispatch to specialized filters if applicable. cmp eax, 0 - je xloop100 // 0 / 128. Blend 100 / 0. + je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. - cmp eax, 64 - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. - - vmovd xmm0, eax // high fraction 0..127 + cmp eax, 128 + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + + vmovd xmm0, eax // high fraction 0..255 neg eax - add eax, 128 - vmovd xmm5, eax // low fraction 128..1 + add eax, 256 + vmovd xmm5, eax // low fraction 256..1 vpunpcklbw xmm5, xmm5, xmm0 vpunpcklwd xmm5, xmm5, xmm5 - vpxor ymm0, ymm0, ymm0 - vpermd ymm5, ymm0, ymm5 + vbroadcastss ymm5, xmm5 + + mov eax, 0x80808080 // 128b for bias and rounding. + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 xloop: vmovdqu ymm0, [esi] vmovdqu ymm2, [esi + edx] vpunpckhbw ymm1, ymm0, ymm2 // mutates - vpunpcklbw ymm0, ymm0, ymm2 // mutates - vpmaddubsw ymm0, ymm0, ymm5 - vpmaddubsw ymm1, ymm1, ymm5 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm1, ymm1, 7 + vpunpcklbw ymm0, ymm0, ymm2 + vpsubb ymm1, ymm1, ymm4 // bias to signed image + vpsubb ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm5, ymm1 + vpmaddubsw ymm0, ymm5, ymm0 + vpaddw ymm1, ymm1, ymm4 // unbias and round + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 8 + vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] @@ -5554,18 +5642,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop jmp xloop99 - // Blend 25 / 75. - xloop25: - vmovdqu ymm0, [esi] - vmovdqu ymm1, [esi + edx] - vpavgb ymm0, ymm0, ymm1 - vpavgb ymm0, ymm0, ymm1 - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop25 - jmp xloop99 - // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] @@ -5576,18 +5652,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 75 / 25. - xloop75: - vmovdqu ymm1, [esi] - vmovdqu ymm0, [esi + edx] - vpavgb ymm0, ymm0, ymm1 - vpavgb ymm0, ymm0, ymm1 - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop75 - jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb @@ -5602,6 +5666,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, #endif // HAS_INTERPOLATEROW_AVX2 // Bilinear filter 16x2 -> 16x1 +// TODO(fbarchard): Consider allowing 256 using memcpy. __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -5609,30 +5674,29 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, __asm { push esi push edi + mov edi, [esp + 8 + 4] // dst_ptr mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - shr eax, 1 // Dispatch to specialized filters if applicable. cmp eax, 0 - je xloop100 // 0 / 128. Blend 100 / 0. - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. - cmp eax, 64 - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. - - movd xmm0, eax // high fraction 0..127 + je xloop100 // 0 /256. Blend 100 / 0. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + + movd xmm0, eax // high fraction 0..255 neg eax - add eax, 128 - movd xmm5, eax // low fraction 128..1 + add eax, 256 + movd xmm5, eax // low fraction 255..1 punpcklbw xmm5, xmm0 punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm4, eax + pshufd xmm4, xmm4, 0x00 xloop: movdqu xmm0, [esi] @@ -5640,136 +5704,23 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop - jmp xloop99 - - // Blend 25 / 75. - xloop25: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - xloop75: - movdqu xmm1, [esi] - movdqu xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - movdqu xmm0, [esi] - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} - -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -__declspec(naked) -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - cmp eax, 64 - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - cmp eax, 192 - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. - - movd xmm5, eax // xmm5 = y fraction - punpcklbw xmm5, xmm5 - psrlw xmm5, 1 - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - punpcklqdq xmm5, xmm5 - pxor xmm4, xmm4 - - xloop: - movdqu xmm0, [esi] // row0 - movdqu xmm2, [esi + edx] // row1 - movdqu xmm1, xmm0 - movdqu xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - psubw xmm2, xmm0 // row1 - row0 - psubw xmm3, xmm1 - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 - paddw xmm3, xmm3 - pmulhw xmm2, xmm5 // scale diff - pmulhw xmm3, xmm5 - paddw xmm0, xmm2 // sum rows - paddw xmm1, xmm3 - packuswb xmm0, xmm1 - movdqu [esi + edi], xmm0 + psubb xmm0, xmm4 // bias image by -128 + psubb xmm1, xmm4 + movdqa xmm2, xmm5 + movdqa xmm3, xmm5 + pmaddubsw xmm2, xmm0 + pmaddubsw xmm3, xmm1 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + psrlw xmm2, 8 + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [esi + edi], xmm2 lea esi, [esi + 16] sub ecx, 16 jg xloop jmp xloop99 - // Blend 25 / 75. - xloop25: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop25 - jmp xloop99 - // Blend 50 / 50. xloop50: movdqu xmm0, [esi] @@ -5781,18 +5732,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 75 / 25. - xloop75: - movdqu xmm1, [esi] - movdqu xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop75 - jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] @@ -5807,18 +5746,17 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ret } } -#endif // HAS_INTERPOLATEROW_SSE2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] - mov ecx, [esp + 16] // pix + mov ecx, [esp + 16] // width wloop: movdqu xmm0, [eax] @@ -5838,13 +5776,13 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBSHUFFLEROW_AVX2 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // pix + mov ecx, [esp + 16] // width wloop: vmovdqu ymm0, [eax] @@ -5866,14 +5804,14 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { + const uint8* shuffler, int width) { __asm { push ebx push esi mov eax, [esp + 8 + 4] // src_argb mov edx, [esp + 8 + 8] // dst_argb mov esi, [esp + 8 + 12] // shuffler - mov ecx, [esp + 8 + 16] // pix + mov ecx, [esp + 8 + 16] // width pxor xmm5, xmm5 mov ebx, [esi] // shuffler @@ -6245,7 +6183,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, // 4 pixel loop. convertloop: - movdqu xmm0, qword ptr [eax] // generate luma ptr + movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 phaddw xmm0, xmm0 pand xmm0, xmm4 // mask out low bits diff --git a/third_party/libyuv/source/row_x86.asm b/third_party/libyuv/source/row_x86.asm deleted file mode 100644 index 0cb326f8e..000000000 --- a/third_party/libyuv/source/row_x86.asm +++ /dev/null @@ -1,146 +0,0 @@ -; -; Copyright 2012 The LibYuv Project Authors. All rights reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%ifdef __YASM_VERSION_ID__ -%if __YASM_VERSION_ID__ < 01020000h -%error AVX2 is supported only by yasm 1.2.0 or later. -%endif -%endif -%include "x86inc.asm" - -SECTION .text - -; cglobal numeric constants are parameters, gpr regs, mm regs - -; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) - -%macro YUY2TOYROW 2-3 -cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix -%ifidn %1,YUY2 - pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff - psrlw m2, m2, 8 -%endif - - ALIGN 4 -.convertloop: - mov%2 m0, [src_yuy2q] - mov%2 m1, [src_yuy2q + mmsize] - lea src_yuy2q, [src_yuy2q + mmsize * 2] -%ifidn %1,YUY2 - pand m0, m0, m2 ; YUY2 even bytes are Y - pand m1, m1, m2 -%else - psrlw m0, m0, 8 ; UYVY odd bytes are Y - psrlw m1, m1, 8 -%endif - packuswb m0, m0, m1 -%if cpuflag(AVX2) - vpermq m0, m0, 0xd8 -%endif - sub pixd, mmsize - mov%2 [dst_yq], m0 - lea dst_yq, [dst_yq + mmsize] - jg .convertloop - REP_RET -%endmacro - -; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version. -INIT_MMX MMX -YUY2TOYROW YUY2,a, -YUY2TOYROW YUY2,u,_Unaligned -YUY2TOYROW UYVY,a, -YUY2TOYROW UYVY,u,_Unaligned -INIT_XMM SSE2 -YUY2TOYROW YUY2,a, -YUY2TOYROW YUY2,u,_Unaligned -YUY2TOYROW UYVY,a, -YUY2TOYROW UYVY,u,_Unaligned -INIT_YMM AVX2 -YUY2TOYROW YUY2,a, -YUY2TOYROW UYVY,a, - -; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) - -%macro SplitUVRow 1-2 -cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix - pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff - psrlw m4, m4, 8 - sub dst_vq, dst_uq - - ALIGN 4 -.convertloop: - mov%1 m0, [src_uvq] - mov%1 m1, [src_uvq + mmsize] - lea src_uvq, [src_uvq + mmsize * 2] - psrlw m2, m0, 8 ; odd bytes - psrlw m3, m1, 8 - pand m0, m0, m4 ; even bytes - pand m1, m1, m4 - packuswb m0, m0, m1 - packuswb m2, m2, m3 -%if cpuflag(AVX2) - vpermq m0, m0, 0xd8 - vpermq m2, m2, 0xd8 -%endif - mov%1 [dst_uq], m0 - mov%1 [dst_uq + dst_vq], m2 - lea dst_uq, [dst_uq + mmsize] - sub pixd, mmsize - jg .convertloop - REP_RET -%endmacro - -INIT_MMX MMX -SplitUVRow a, -SplitUVRow u,_Unaligned -INIT_XMM SSE2 -SplitUVRow a, -SplitUVRow u,_Unaligned -INIT_YMM AVX2 -SplitUVRow a, - -; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, -; int width); - -%macro MergeUVRow_ 1-2 -cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix - sub src_vq, src_uq - - ALIGN 4 -.convertloop: - mov%1 m0, [src_uq] - mov%1 m1, [src_vq] - lea src_uq, [src_uq + mmsize] - punpcklbw m2, m0, m1 // first 8 UV pairs - punpckhbw m0, m0, m1 // next 8 UV pairs -%if cpuflag(AVX2) - vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0 - vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0 - mov%1 [dst_uvq], m1 - mov%1 [dst_uvq + mmsize], m2 -%else - mov%1 [dst_uvq], m2 - mov%1 [dst_uvq + mmsize], m0 -%endif - lea dst_uvq, [dst_uvq + mmsize * 2] - sub pixd, mmsize - jg .convertloop - REP_RET -%endmacro - -INIT_MMX MMX -MergeUVRow_ a, -MergeUVRow_ u,_Unaligned -INIT_XMM SSE2 -MergeUVRow_ a, -MergeUVRow_ u,_Unaligned -INIT_YMM AVX2 -MergeUVRow_ a, - diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc index 0a01304c4..36e3fe528 100644 --- a/third_party/libyuv/source/scale.cc +++ b/third_party/libyuv/source/scale.cc @@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 : - ScaleRowDown2Box_Any_SSE2); +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 : + ScaleRowDown2Box_Any_SSSE3); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : - ScaleRowDown2Box_SSE2); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 : + ScaleRowDown2Box_SSSE3); } } #endif @@ -85,12 +85,12 @@ static void ScalePlaneDown2(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && +#if defined(HAS_SCALEROWDOWN2_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown2 = filtering ? - ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; } #endif @@ -135,12 +135,12 @@ static void ScalePlaneDown2_16(int src_width, int src_height, ScaleRowDown2Box_16_SSE2); } #endif -#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && +#if defined(HAS_SCALEROWDOWN2_16_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown2 = filtering ? - ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2; + ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2; } #endif @@ -182,12 +182,12 @@ static void ScalePlaneDown4(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN4_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { +#if defined(HAS_SCALEROWDOWN4_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2; + ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; } } #endif @@ -200,12 +200,12 @@ static void ScalePlaneDown4(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && +#if defined(HAS_SCALEROWDOWN4_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown4 = filtering ? - ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; } #endif @@ -245,12 +245,12 @@ static void ScalePlaneDown4_16(int src_width, int src_height, ScaleRowDown4_16_SSE2; } #endif -#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && +#if defined(HAS_SCALEROWDOWN4_16_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown4 = filtering ? - ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2; + ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2; } #endif @@ -325,16 +325,16 @@ static void ScalePlaneDown34(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && +#if defined(HAS_SCALEROWDOWN34_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; + ScaleRowDown34_0 = ScaleRowDown34_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_DSPR2; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2; } } #endif @@ -404,16 +404,16 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && +#if defined(HAS_SCALEROWDOWN34_16_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2; + ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2; } } #endif @@ -517,16 +517,16 @@ static void ScalePlaneDown38(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && +#if defined(HAS_SCALEROWDOWN38_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_DSPR2; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2; } } #endif @@ -595,16 +595,16 @@ static void ScalePlaneDown38_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && +#if defined(HAS_SCALEROWDOWN38_16_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2; } } #endif @@ -659,7 +659,6 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, int i; int scaletbl[2]; int minboxwidth = dx >> 16; - int* scaleptr = scaletbl - minboxwidth; int boxwidth; scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); @@ -667,7 +666,8 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * + scaletbl[boxwidth - minboxwidth] >> 16; } } @@ -676,7 +676,6 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, int i; int scaletbl[2]; int minboxwidth = dx >> 16; - int* scaleptr = scaletbl - minboxwidth; int boxwidth; scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); @@ -684,8 +683,8 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = - SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * + scaletbl[boxwidth - minboxwidth] >> 16; } } @@ -875,14 +874,6 @@ void ScalePlaneBilinearDown(int src_width, int src_height, &x, &y, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -907,11 +898,11 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { - InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + InterpolateRow = InterpolateRow_Any_DSPR2; if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_MIPS_DSPR2; + InterpolateRow = InterpolateRow_DSPR2; } } #endif @@ -1011,11 +1002,11 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { - InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; +#if defined(HAS_INTERPOLATEROW_16_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + InterpolateRow = InterpolateRow_Any_16_DSPR2; if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + InterpolateRow = InterpolateRow_16_DSPR2; } } #endif @@ -1072,14 +1063,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height, &x, &y, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -1104,11 +1087,11 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { - InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + InterpolateRow = InterpolateRow_Any_DSPR2; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_MIPS_DSPR2; + InterpolateRow = InterpolateRow_DSPR2; } } #endif @@ -1243,11 +1226,11 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { - InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; +#if defined(HAS_INTERPOLATEROW_16_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + InterpolateRow = InterpolateRow_Any_16_DSPR2; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + InterpolateRow = InterpolateRow_16_DSPR2; } } #endif diff --git a/third_party/libyuv/source/scale_any.cc b/third_party/libyuv/source/scale_any.cc index 2f6a2c8ba..ed76a9e4c 100644 --- a/third_party/libyuv/source/scale_any.cc +++ b/third_party/libyuv/source/scale_any.cc @@ -55,12 +55,29 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, dst_ptr + n * BPP, r); \ } -#ifdef HAS_SCALEROWDOWN2_SSE2 -SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2, +// Fixed scale down for odd source width. Used by I420Blend subsampling. +// Since dst_width is (width + 1) / 2, this function scales one less pixel +// and copies the last pixel. +#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSSE3 +SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3, ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C, +SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C, 2, 1, 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, 2, 1, 15) #endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) @@ -68,6 +85,8 @@ SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, ScaleRowDown2Linear_C, 2, 1, 31) SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, 2, 1, 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C, + 2, 1, 31) #endif #ifdef HAS_SCALEROWDOWN2_NEON SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) @@ -75,10 +94,12 @@ SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, ScaleRowDown2Linear_C, 2, 1, 15) SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, ScaleRowDown2Box_C, 2, 1, 15) +SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, 2, 1, 15) #endif -#ifdef HAS_SCALEROWDOWN4_SSE2 -SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C, +#ifdef HAS_SCALEROWDOWN4_SSSE3 +SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C, 4, 1, 7) #endif #ifdef HAS_SCALEROWDOWN4_AVX2 diff --git a/third_party/libyuv/source/scale_argb.cc b/third_party/libyuv/source/scale_argb.cc index 40a2d1ab2..17f51ae9b 100644 --- a/third_party/libyuv/source/scale_argb.cc +++ b/third_party/libyuv/source/scale_argb.cc @@ -210,14 +210,6 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. src_argb += xl * 4; x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -242,12 +234,12 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { - InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + InterpolateRow = InterpolateRow_Any_DSPR2; if (IS_ALIGNED(clip_src_width, 4)) { - InterpolateRow = InterpolateRow_MIPS_DSPR2; + InterpolateRow = InterpolateRow_DSPR2; } } #endif @@ -308,14 +300,6 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -340,10 +324,10 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_MIPS_DSPR2; + InterpolateRow = InterpolateRow_DSPR2; } #endif if (src_width >= 32768) { @@ -481,27 +465,19 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && +#if defined(HAS_I422TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + I422ToARGBRow = I422ToARGBRow_DSPR2; } #endif void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -526,10 +502,10 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - InterpolateRow = InterpolateRow_MIPS_DSPR2; + InterpolateRow = InterpolateRow_DSPR2; } #endif @@ -847,6 +823,36 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb, return 0; } +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering) { + uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4); + int r; + I420ToARGB(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + argb_buffer, src_width * 4, + src_width, src_height); + + r = ARGBScaleClip(argb_buffer, src_width * 4, + src_width, src_height, + dst_argb, dst_stride_argb, + dst_width, dst_height, + clip_x, clip_y, clip_width, clip_height, + filtering); + free(argb_buffer); + return r; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc index 1711f3d54..3507aa4d9 100644 --- a/third_party/libyuv/source/scale_common.cc +++ b/third_party/libyuv/source/scale_common.cc @@ -103,6 +103,28 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } +void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst += 1; + s += 2; + t += 2; + } + dst[0] = (s[0] + t[0] + 1) >> 1; +} + void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst, int dst_width) { const uint16* s = src_ptr; @@ -395,8 +417,14 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, } // (1-f)a + fb can be replaced with a + f(b-a) +#if defined(__arm__) || defined(__aarch64__) +#define BLENDER(a, b, f) (uint8)((int)(a) + \ + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#else +// inteluses 7 bit math with rounding. #define BLENDER(a, b, f) (uint8)((int)(a) + \ - ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +#endif void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { @@ -448,8 +476,9 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, } #undef BLENDER +// Same as 8 bit arm blender but return is cast to uint16 #define BLENDER(a, b, f) (uint16)((int)(a) + \ - ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, int dst_width, int x, int dx) { @@ -787,6 +816,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, } } +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. // Mimics SSSE3 blender #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 #define BLENDERC(a, b, f, s) (uint32)( \ @@ -876,14 +906,6 @@ void ScalePlaneVertical(int src_height, assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -908,13 +930,13 @@ void ScalePlaneVertical(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + InterpolateRow = InterpolateRow_Any_DSPR2; if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_MIPS_DSPR2; + InterpolateRow = InterpolateRow_DSPR2; } } #endif @@ -982,13 +1004,13 @@ void ScalePlaneVertical_16(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && +#if defined(HAS_INTERPOLATEROW_16_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + InterpolateRow = InterpolateRow_Any_16_DSPR2; if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + InterpolateRow = InterpolateRow_16_DSPR2; } } #endif diff --git a/third_party/libyuv/source/scale_gcc.cc b/third_party/libyuv/source/scale_gcc.cc index 8a6ac5459..e2f88544b 100644 --- a/third_party/libyuv/source/scale_gcc.cc +++ b/third_party/libyuv/source/scale_gcc.cc @@ -9,6 +9,7 @@ */ #include "libyuv/row.h" +#include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { @@ -16,7 +17,8 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) // Offsets for source bytes 0 to 9 static uvec8 kShuf0 = @@ -96,8 +98,8 @@ static uvec16 kScaleAb2 = // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { asm volatile ( LABELALIGN "1: \n" @@ -118,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" @@ -145,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" + :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" ); } -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" @@ -162,17 +164,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" @@ -186,7 +188,105 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +#ifdef HAS_SCALEROWDOWN2_AVX2 +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" + ); +} + +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 + MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" @@ -214,12 +314,15 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - intptr_t stridex3 = 0; + intptr_t stridex3; asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0x8,%%xmm7 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" LABELALIGN @@ -228,31 +331,29 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 - MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 - MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm7,%%xmm2 \n" - "pand %%xmm7,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "pavgw %%xmm2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x8,%2 \n" @@ -260,13 +361,100 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 - "+r"(stridex3) // %3 + "=&r"(stridex3) // %3 : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } + +#ifdef HAS_SCALEROWDOWN4_AVX2 +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} + +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 + MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2 + MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3 + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2 + MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_SCALEROWDOWN4_AVX2 + void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -574,61 +762,89 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - int tmp_height = 0; - intptr_t tmp_src = 0; +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { asm volatile ( - "mov %0,%3 \n" // row pointer - "mov %5,%2 \n" // height - "pxor %%xmm0,%%xmm0 \n" // clear accumulators - "pxor %%xmm1,%%xmm1 \n" - "pxor %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu " MEMACCESS(3) ",%%xmm2 \n" - "add %6,%3 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" "paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm3,%%xmm1 \n" - "sub $0x1,%2 \n" - "jg 1b \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 - "mov %0,%3 \n" // row pointer - "mov %5,%2 \n" // height - "pxor %%xmm0,%%xmm0 \n" // clear accumulators - "pxor %%xmm1,%%xmm1 \n" - "sub $0x10,%4 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + asm volatile ( + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n" + "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x20,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 - "+r"(tmp_height), // %2 - "+r"(tmp_src), // %3 - "+r"(src_width), // %4 - "+rm"(src_height) // %5 - : "rm"((intptr_t)(src_stride)) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static uvec8 kFsub80 = + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; + +// Constant for making pixels unsigned and adding .5 for rounding. +static uvec16 kFadd40 = + { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; // Bilinear column filtering. SSSE3 version. void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + intptr_t x0, x1, temp_pixel; asm volatile ( "movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" "movd %k2,%%xmm5 \n" "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + "pextrw $0x1,%%xmm2,%k3 \n" "subl $0x2,%5 \n" "jl 29f \n" @@ -650,16 +866,19 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "movd %k2,%%xmm4 \n" "pshufb %%xmm5,%%xmm1 \n" "punpcklwd %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" "pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x3,%%xmm2,%k4 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k2 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" "mov %w2," MEMACCESS(0) " \n" "lea " MEMLEA(0x2,0) ",%0 \n" - "sub $0x2,%5 \n" + "subl $0x2,%5 \n" "jge 2b \n" LABELALIGN @@ -670,23 +889,37 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "movd %k2,%%xmm0 \n" "psrlw $0x9,%%xmm2 \n" "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" "mov %b2," MEMACCESS(0) " \n" "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+a"(temp_pixel), // %2 - "+r"(x0), // %3 - "+r"(x1), // %4 - "+rm"(dst_width) // %5 - : "rm"(x), // %6 - "rm"(dx) // %7 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 +#if defined(__x86_64__) + "+rm"(dst_width) // %5 +#else + "+m"(dst_width) // %5 +#endif + : "rm"(x), // %6 + "rm"(dx), // %7 +#if defined(__x86_64__) + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 +#else + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 +#endif : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -795,7 +1028,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); - intptr_t src_stepx_x12 = 0; + intptr_t src_stepx_x12; asm volatile ( "lea " MEMLEA3(0x00,1,4) ",%1 \n" "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" @@ -813,11 +1046,11 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, "lea " MEMLEA(0x10,2) ",%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "+r"(src_stepx_x12) // %4 + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 :: "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3" ); @@ -829,7 +1062,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); - intptr_t src_stepx_x12 = 0; + intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); asm volatile ( "lea " MEMLEA3(0x00,1,4) ",%1 \n" @@ -858,12 +1091,12 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, "lea " MEMLEA(0x10,2) ",%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "+r"(src_stepx_x12), // %4 - "+r"(row1) // %5 + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 :: "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3" ); @@ -871,7 +1104,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; + intptr_t x0, x1; asm volatile ( "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" @@ -924,8 +1157,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 "movd %%xmm0," MEMACCESS(2) " \n" "99: \n" - : "+a"(x0), // %0 - "+d"(x1), // %1 + : "=&a"(x0), // %0 + "=&d"(x1), // %1 "+r"(dst_argb), // %2 "+r"(src_argb), // %3 "+r"(dst_width) // %4 @@ -976,7 +1209,7 @@ static uvec8 kShuffleFractions = { // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; + intptr_t x0, x1; asm volatile ( "movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" @@ -1039,8 +1272,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+rm"(dst_width), // %2 - "+r"(x0), // %3 - "+r"(x1) // %4 + "=&r"(x0), // %3 + "=&r"(x1) // %4 : "rm"(x), // %5 "rm"(dx) // %6 : "memory", "cc", NACL_R14 diff --git a/third_party/libyuv/source/scale_mips.cc b/third_party/libyuv/source/scale_mips.cc index 3eb4f27c4..ae953073f 100644 --- a/third_party/libyuv/source/scale_mips.cc +++ b/third_party/libyuv/source/scale_mips.cc @@ -21,8 +21,8 @@ extern "C" { defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ (_MIPS_SIM == _MIPS_SIM_ABI32) -void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { __asm__ __volatile__( ".set push \n" ".set noreorder \n" @@ -31,7 +31,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "beqz $t9, 2f \n" " nop \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -78,8 +77,8 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { const uint8* t = src_ptr + src_stride; __asm__ __volatile__ ( @@ -90,7 +89,6 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bltz $t9, 2f \n" " nop \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -178,8 +176,8 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -188,7 +186,6 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "beqz $t9, 2f \n" " nop \n" - ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -234,8 +231,8 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { intptr_t stride = src_stride; const uint8* s1 = src_ptr + stride; const uint8* s2 = s1 + stride; @@ -248,7 +245,6 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "srl $t9, %[dst_width], 1 \n" "andi $t8, %[dst_width], 1 \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[s1]) \n" // |7|6|5|4| @@ -314,12 +310,11 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -361,14 +356,13 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { +void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" "repl.ph $t3, 3 \n" // 0x00030003 - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -418,14 +412,13 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { +void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" "repl.ph $t2, 3 \n" // 0x00030003 - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -471,13 +464,12 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -518,8 +510,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { intptr_t stride = src_stride; const uint8* t = src_ptr + stride; const int c = 0x2AAA; @@ -528,7 +520,6 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| @@ -572,9 +563,9 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { intptr_t stride = src_stride; const uint8* s1 = src_ptr + stride; stride += stride; @@ -586,7 +577,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc index 7825878e9..44b0c8080 100644 --- a/third_party/libyuv/source/scale_neon.cc +++ b/third_party/libyuv/source/scale_neon.cc @@ -26,7 +26,6 @@ extern "C" { void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 MEMACCESS(0) @@ -47,7 +46,6 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc @@ -73,7 +71,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc @@ -101,7 +98,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -123,7 +119,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr3 = src_ptr + src_stride * 3; asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load up 16x4 @@ -162,7 +157,6 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -185,7 +179,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -245,7 +238,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -300,7 +292,6 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, asm volatile ( MEMACCESS(3) "vld1.8 {q3}, [%3] \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" @@ -334,7 +325,6 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, MEMACCESS(7) "vld1.8 {q15}, [%7] \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -450,7 +440,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, MEMACCESS(5) "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -543,9 +532,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp = NULL; + const uint8* src_tmp; asm volatile ( - ".p2align 2 \n" "1: \n" "mov %0, %1 \n" "mov r12, %5 \n" @@ -564,12 +552,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %1, #16 \n" "subs %4, %4, #16 \n" // 16 processed per loop "bgt 1b \n" - : "+r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 : : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List ); @@ -584,13 +572,16 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, MEMACCESS(6) \ "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" +// The NEON version mimics this formula: +// #define BLENDER(a, b, f) (uint8)((int)(a) + +// ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8* src_tmp = src_ptr; asm volatile ( - ".p2align 2 \n" "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx "vld1.32 {q2}, [%5] \n" // 0 1 2 3 @@ -621,8 +612,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "vmovl.u16 q10, d21 \n" "vmul.s32 q11, q11, q13 \n" "vmul.s32 q12, q12, q10 \n" - "vshrn.s32 d18, q11, #16 \n" - "vshrn.s32 d19, q12, #16 \n" + "vrshrn.s32 d18, q11, #16 \n" + "vrshrn.s32 d19, q12, #16 \n" "vadd.s16 q8, q8, q9 \n" "vmovn.s16 d6, q8 \n" @@ -749,7 +740,6 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 MEMACCESS(0) @@ -773,7 +763,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -804,7 +793,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -845,7 +833,6 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( "mov r12, %3, lsl #2 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.32 {d0[0]}, [%0], r12 \n" @@ -875,7 +862,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, asm volatile ( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 @@ -927,10 +913,9 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { - int tmp = 0; + int tmp; const uint8* src_tmp = src_argb; asm volatile ( - ".p2align 2 \n" "1: \n" LOAD1_DATA32_LANE(d0, 0) LOAD1_DATA32_LANE(d0, 1) @@ -945,13 +930,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, "vst1.32 {q0, q1}, [%0]! \n" // store pixels "subs %2, %2, #8 \n" // 8 processed per loop "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 : : "memory", "cc", "q0", "q1" ); @@ -974,7 +959,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, int* tmp = dx_offset; const uint8* src_tmp = src_argb; asm volatile ( - ".p2align 2 \n" "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx "vld1.32 {q2}, [%5] \n" // 0 1 2 3 diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc index 1d5519357..ff277f26f 100644 --- a/third_party/libyuv/source/scale_neon64.cc +++ b/third_party/libyuv/source/scale_neon64.cc @@ -547,7 +547,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp = NULL; + const uint8* src_tmp; asm volatile ( "1: \n" "mov %0, %1 \n" @@ -567,12 +567,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %1, #16 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop "b.gt 1b \n" - : "+r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 : : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List ); @@ -626,8 +626,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "ushll2 v6.4s, v6.8h, #0 \n" "mul v16.4s, v16.4s, v7.4s \n" "mul v17.4s, v17.4s, v6.4s \n" - "shrn v6.4h, v16.4s, #16 \n" - "shrn2 v6.8h, v17.4s, #16 \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" "add v4.8h, v4.8h, v6.8h \n" "xtn v4.8b, v4.8h \n" @@ -931,7 +931,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. int64 x64 = (int64) x; int64 dx64 = (int64) dx; - int64 tmp64 = 0; + int64 tmp64; asm volatile ( "1: \n" LOAD1_DATA32_LANE(v0, 0) @@ -947,13 +947,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "subs %w2, %w2, #8 \n" // 8 processed per loop "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "+r"(tmp64), // %5 - "+r"(src_tmp) // %6 + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 : : "memory", "cc", "v0", "v1" ); diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc index c3896ebad..f17097365 100644 --- a/third_party/libyuv/source/scale_win.cc +++ b/third_party/libyuv/source/scale_win.cc @@ -16,9 +16,8 @@ namespace libyuv { extern "C" { #endif -// This module is for Visual C x86. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ - defined(_MSC_VER) && !defined(__clang__) +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) // Offsets for source bytes 0 to 9 static uvec8 kShuf0 = @@ -96,8 +95,8 @@ static uvec16 kScaleAb2 = // Reads 32 pixels, throws half away and writes 16 pixels. __declspec(naked) -void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored @@ -122,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x1 rectangle to 16x1. __declspec(naked) -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 packuswb xmm0, xmm1 - movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -158,16 +154,19 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x2 rectangle to 16x1. __declspec(naked) -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] @@ -175,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add + paddw xmm1, xmm3 + psrlw xmm0, 1 + psrlw xmm1, 1 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 packuswb xmm0, xmm1 - movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -246,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - - vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -264,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } } +// For rounding, average = (sum + 2) / 4 +// becomes average((sum >> 1), 0) // Blends 64x2 rectangle to 32x1. __declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -281,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vpxor ymm5, ymm5, ymm5 // constant 0 wloop: - vmovdqu ymm0, [eax] // average rows + vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] lea eax, [eax + 64] - - vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm1, ymm1, 1 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -308,7 +309,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // Point samples 32 pixels to 8 pixels. __declspec(naked) -void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -339,7 +340,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x4 rectangle to 8x1. __declspec(naked) -void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi @@ -349,42 +350,40 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, mov edx, [esp + 8 + 12] // dst_ptr mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + movdqa xmm5, xmm4 + packuswb xmm4, xmm4 + psllw xmm5, 3 // constant 0x0008 wloop: movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] - movdqu xmm4, [eax + edi] - movdqu xmm5, [eax + edi + 16] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 2 + paddw xmm1, xmm3 + movdqu xmm2, [eax + edi] + movdqu xmm3, [eax + edi + 16] lea eax, [eax + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm7 - pand xmm3, xmm7 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - movdqa xmm2, xmm0 // average columns (16 to 8 pixels) - psrlw xmm0, 8 - pand xmm2, xmm7 - pavgw xmm0, xmm2 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 3 + paddw xmm1, xmm3 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 lea edx, [edx + 8] sub ecx, 8 @@ -443,37 +442,41 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, mov edx, [esp + 8 + 12] // dst_ptr mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff - vpsrlw ymm7, ymm7, 8 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpsrlw ymm4, ymm4, 15 + vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpackuswb ymm4, ymm4, ymm4 wloop: vmovdqu ymm0, [eax] // average rows vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + esi * 2] vmovdqu ymm3, [eax + esi * 2 + 32] - vpavgb ymm2, ymm2, [eax + edi] - vpavgb ymm3, ymm3, [eax + edi + 32] - lea eax, [eax + 64] - vpavgb ymm0, ymm0, ymm2 - vpavgb ymm1, ymm1, ymm3 - - vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) - vpand ymm3, ymm1, ymm7 - vpsrlw ymm0, ymm0, 8 - vpsrlw ymm1, ymm1, 8 - vpavgw ymm0, ymm0, ymm2 - vpavgw ymm1, ymm1, ymm3 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - - vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) - vpsrlw ymm0, ymm0, 8 - vpavgw ymm0, ymm0, ymm2 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm1, ymm1, ymm3 + vmovdqu ymm2, [eax + edi] + vmovdqu ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm1, ymm1, ymm3 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 vpackuswb ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -499,9 +502,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width - movdqa xmm3, kShuf0 - movdqa xmm4, kShuf1 - movdqa xmm5, kShuf2 + movdqa xmm3, xmmword ptr kShuf0 + movdqa xmm4, xmmword ptr kShuf1 + movdqa xmm5, xmmword ptr kShuf2 wloop: movdqu xmm0, [eax] @@ -548,12 +551,12 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 wloop: movdqu xmm0, [eax] // pixels 0..7 @@ -579,7 +582,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, lea eax, [eax + 32] pavgb xmm0, xmm1 pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 + movdqa xmm1, xmmword ptr kMadd21 pmaddubsw xmm0, xmm1 paddsw xmm0, xmm7 psrlw xmm0, 2 @@ -605,12 +608,12 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 wloop: movdqu xmm0, [eax] // pixels 0..7 @@ -639,7 +642,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 + movdqa xmm1, xmmword ptr kMadd21 pmaddubsw xmm0, xmm1 paddsw xmm0, xmm7 psrlw xmm0, 2 @@ -665,8 +668,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width - movdqa xmm4, kShuf38a - movdqa xmm5, kShuf38b + movdqa xmm4, xmmword ptr kShuf38a + movdqa xmm5, xmmword ptr kShuf38b xloop: movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 @@ -698,9 +701,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAc - movdqa xmm3, kShufAc3 - movdqa xmm4, kScaleAc33 + movdqa xmm2, xmmword ptr kShufAc + movdqa xmm3, xmmword ptr kShufAc3 + movdqa xmm4, xmmword ptr kScaleAc33 pxor xmm5, xmm5 xloop: @@ -763,10 +766,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAb0 - movdqa xmm3, kShufAb1 - movdqa xmm4, kShufAb2 - movdqa xmm5, kScaleAb2 + movdqa xmm2, xmmword ptr kShufAb0 + movdqa xmm3, xmmword ptr kShufAb1 + movdqa xmm4, xmmword ptr kShufAb2 + movdqa xmm5, xmmword ptr kScaleAb2 xloop: movdqu xmm0, [eax] // average 2 rows into xmm0 @@ -857,6 +860,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { } #endif // HAS_SCALEADDROW_AVX2 +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static uvec8 kFsub80 = + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; + +// Constant for making pixels unsigned and adding .5 for rounding. +static uvec16 kFadd40 = + { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; + // Bilinear column filtering. SSSE3 version. __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, @@ -874,6 +887,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movd xmm5, eax pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 + pcmpeqb xmm7, xmm7 // generate 0x0001 + psrlw xmm7, 15 pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 @@ -896,20 +911,22 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movd xmm4, ebx pshufb xmm1, xmm5 // 0011 punpcklwd xmm0, xmm4 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. pextrw eax, xmm2, 1 // get x0 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // 8 bits, 2 pixels. - movd ebx, xmm0 + paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. + movd ebx, xmm1 mov [edi], bx lea edi, [edi + 2] sub ecx, 2 // 2 pixels jge xloop2 xloop29: - add ecx, 2 - 1 jl xloop99 @@ -918,11 +935,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movd xmm0, ebx psrlw xmm2, 9 // 7 bit fractions. pshufb xmm2, xmm5 // 0011 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // 16 bit - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // 8 bits - movd ebx, xmm0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit + paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits + movd ebx, xmm2 mov [edi], bl xloop99: @@ -1233,8 +1253,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, kShuffleColARGB - movdqa xmm5, kShuffleFractions + movdqa xmm4, xmmword ptr kShuffleColARGB + movdqa xmm5, xmmword ptr kShuffleFractions pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 pextrw eax, xmm2, 1 // get x0 integer. preroll diff --git a/third_party/libyuv/source/video_common.cc b/third_party/libyuv/source/video_common.cc index 379a0669a..00fb71e18 100644 --- a/third_party/libyuv/source/video_common.cc +++ b/third_party/libyuv/source/video_common.cc @@ -25,6 +25,7 @@ struct FourCCAliasEntry { static const struct FourCCAliasEntry kFourCCAliases[] = { {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, {FOURCC_YU16, FOURCC_I422}, {FOURCC_YU24, FOURCC_I444}, {FOURCC_YUYV, FOURCC_YUY2}, diff --git a/third_party/libyuv/source/x86inc.asm b/third_party/libyuv/source/x86inc.asm deleted file mode 100644 index cb5c32df3..000000000 --- a/third_party/libyuv/source/x86inc.asm +++ /dev/null @@ -1,1136 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2012 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Anton Mitrofanov <BugMaster@narod.ru> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. Send patches or ideas -; to x264-devel@videolan.org . - -; Local changes for libyuv: -; remove %define program_name and references in labels -; rename cpus to uppercase - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -; Name of the .rodata section. -; Kludge: Something on OS X fails to align .rodata even given an align attribute, -; so use a different read-only section. -%macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=%1 - %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=%1 - fakegot: - %elifidn __OUTPUT_FORMAT__,aout - section .text - %else - SECTION .rodata align=%1 - %endif -%endmacro - -; aout does not support align= -%macro SECTION_TEXT 0-1 16 - %ifidn __OUTPUT_FORMAT__,aout - SECTION .text - %else - SECTION .text align=%1 - %endif -%endmacro - -%if WIN64 - %define PIC -%elif ARCH_X86_64 == 0 -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %undef PIC -%endif -%ifdef PIC - default rel -%endif - -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPU amdnop - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons -; which are slow when a normal ret follows a branch. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m [rsp + stack_offset + %3] - %define r%1mp qword r %+ %1m - %else - %define r%1m [esp + stack_offset + %3] - %define r%1mp dword r %+ %1m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %assign stack_offset stack_offset+gprsize -%endmacro - -%macro POP 1 - pop %1 - %assign stack_offset stack_offset-gprsize -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rsp - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rsp - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - %if mmsize == 8 - %assign xmm_regs_used 0 - %else - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS %4 -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 6 - SUB rsp, (xmm_regs_used-6)*16+16 - %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) - %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i - %endrep - %endif -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %if xmm_regs_used > 6 - %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) - %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] - %endrep - add %1, (xmm_regs_used-6)*16+16 - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif - ret -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - -%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS %4 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 - -%macro RET 0 - POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif - ret -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m [esp + stack_offset + 4*%1 + 4] - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... - %assign num_args %1 - %assign regs_used %2 - %if regs_used > 7 - %assign regs_used 7 - %endif - ASSERT regs_used >= num_args - PUSH_IF_USED 3, 4, 5, 6 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS %4 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 - -%macro RET 0 - POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif - ret -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%endif - -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif -%endmacro - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -%macro cglobal 1-2+ ; name, [PROLOGUE args] -%if %0 == 1 - cglobal_internal %1 %+ SUFFIX -%else - cglobal_internal %1 %+ SUFFIX, %2 -%endif -%endmacro -%macro cglobal_internal 1-2+ - %ifndef cglobaled_%1 - %xdefine %1 mangle(%1) - %xdefine %1.skip_prologue %1 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %1, 1 - %endif - %xdefine current_function %1 - %ifidn __OUTPUT_FORMAT__,elf - global %1:function hidden - %else - global %1 - %endif - align function_align - %1: - RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer - %assign stack_offset 0 - %if %0 > 1 - PROLOGUE %2 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 2+ - %xdefine %1 mangle(%1) - global %1 - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf32 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf64 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif - -; cpuflags - -%assign cpuflags_MMX (1<<0) -%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX -%assign cpuflags_3dnow (1<<2) | cpuflags_MMX -%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow -%assign cpuflags_SSE (1<<4) | cpuflags_MMX2 -%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE -%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2 -%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2 -%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3 -%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3 -%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4 -%assign cpuflags_AVX (1<<11)| cpuflags_SSE42 -%assign cpuflags_xop (1<<12)| cpuflags_AVX -%assign cpuflags_fma4 (1<<13)| cpuflags_AVX -%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX -%assign cpuflags_fma3 (1<<15)| cpuflags_AVX - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_misalign (1<<20) -%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<22) -%assign cpuflags_bmi1 (1<<23) -%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1 -%assign cpuflags_tbm (1<<25)|cpuflags_bmi1 - -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) - -; Takes up to 2 cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. -%macro INIT_CPUFLAGS 0-2 - %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif - %xdefine SUFFIX _ %+ cpuname - %if cpuflag(AVX) - %assign AVX_enabled 1 - %endif - %if mmsize == 16 && notcpuflag(SSE2) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elifidn %1, SSE3 - %define movu lddqu - %endif - %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags - %endif -%endmacro - -; merge MMX and SSE* - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign AVX_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign AVX_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_YMM 0-1+ - %assign AVX_enabled 1 - %define RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova vmovaps - %define movu vmovups - %undef movh - %define movnta vmovntps - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine tmp%2 m%2 - %xdefine ntmp%2 nm%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 tmp%2 - %xdefine nm%1 ntmp%2 - %undef tmp%2 - %undef ntmp%2 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) -%rep %0-1 -%ifdef m%1 - %xdefine tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 -%else - ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. - ; Be careful using this mode in nested macros though, as in some cases there may be - ; other copies of m# that have already been dereferenced and don't get updated correctly. - %xdefine %%n1 n %+ %1 - %xdefine %%n2 n %+ %2 - %xdefine tmp m %+ %%n1 - CAT_XDEFINE m, %%n1, m %+ %%n2 - CAT_XDEFINE m, %%n2, tmp - CAT_XDEFINE n, m %+ %%n1, %%n1 - CAT_XDEFINE n, m %+ %%n2, %%n2 -%endif - %undef tmp - %rotate 1 -%endrep -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1, %1 %+ SUFFIX -%endmacro -%macro call_internal 2 - %xdefine %%i %1 - %ifndef cglobaled_%1 - %ifdef cglobaled_%2 - %xdefine %%i %2 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE sizeofymm, i, 32 -%assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-AVX emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) -;%4 == number of operands given -;%5+: operands -%macro RUN_AVX_INSTR 6-7+ - %ifid %6 - %define %%sizeofreg sizeof%6 - %elifid %5 - %define %%sizeofreg sizeof%5 - %else - %define %%sizeofreg mmsize - %endif - %if %%sizeofreg==32 - %if %4>=3 - v%1 %5, %6, %7 - %else - v%1 %5, %6 - %endif - %else - %if %%sizeofreg==8 - %define %%regmov movq - %elif %2 - %define %%regmov movaps - %else - %define %%regmov movdqa - %endif - - %if %4>=3+%3 - %ifnidn %5, %6 - %if AVX_enabled && %%sizeofreg==16 - v%1 %5, %6, %7 - %else - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 - %%regmov %5, %6 - %1 %5, %7 - %endif - %else - %1 %5, %7 - %endif - %elif %4>=3 - %1 %5, %6, %7 - %else - %1 %5, %6 - %endif - %endif -%endmacro - -; 3arg AVX ops with a memory arg can only have it in src2, -; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). -; So, if the op is symmetric and the wrong one is memory, swap them. -%macro RUN_AVX_INSTR1 8 - %assign %%swap 0 - %if AVX_enabled - %ifnid %6 - %assign %%swap 1 - %endif - %elifnidn %5, %6 - %ifnid %7 - %assign %%swap 1 - %endif - %endif - %if %%swap && %3 == 0 && %8 == 1 - RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 - %else - RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 - %endif -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) -;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 4 - %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 - %ifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -AVX_INSTR addpd, 1, 0, 1 -AVX_INSTR addps, 1, 0, 1 -AVX_INSTR addsd, 1, 0, 1 -AVX_INSTR addss, 1, 0, 1 -AVX_INSTR addsubpd, 1, 0, 0 -AVX_INSTR addsubps, 1, 0, 0 -AVX_INSTR andpd, 1, 0, 1 -AVX_INSTR andps, 1, 0, 1 -AVX_INSTR andnpd, 1, 0, 0 -AVX_INSTR andnps, 1, 0, 0 -AVX_INSTR blendpd, 1, 0, 0 -AVX_INSTR blendps, 1, 0, 0 -AVX_INSTR blendvpd, 1, 0, 0 -AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 0, 0 -AVX_INSTR cmpps, 1, 0, 0 -AVX_INSTR cmpsd, 1, 0, 0 -AVX_INSTR cmpss, 1, 0, 0 -AVX_INSTR cvtdq2ps, 1, 0, 0 -AVX_INSTR cvtps2dq, 1, 0, 0 -AVX_INSTR divpd, 1, 0, 0 -AVX_INSTR divps, 1, 0, 0 -AVX_INSTR divsd, 1, 0, 0 -AVX_INSTR divss, 1, 0, 0 -AVX_INSTR dppd, 1, 1, 0 -AVX_INSTR dpps, 1, 1, 0 -AVX_INSTR haddpd, 1, 0, 0 -AVX_INSTR haddps, 1, 0, 0 -AVX_INSTR hsubpd, 1, 0, 0 -AVX_INSTR hsubps, 1, 0, 0 -AVX_INSTR maxpd, 1, 0, 1 -AVX_INSTR maxps, 1, 0, 1 -AVX_INSTR maxsd, 1, 0, 1 -AVX_INSTR maxss, 1, 0, 1 -AVX_INSTR minpd, 1, 0, 1 -AVX_INSTR minps, 1, 0, 1 -AVX_INSTR minsd, 1, 0, 1 -AVX_INSTR minss, 1, 0, 1 -AVX_INSTR movhlps, 1, 0, 0 -AVX_INSTR movlhps, 1, 0, 0 -AVX_INSTR movsd, 1, 0, 0 -AVX_INSTR movss, 1, 0, 0 -AVX_INSTR mpsadbw, 0, 1, 0 -AVX_INSTR mulpd, 1, 0, 1 -AVX_INSTR mulps, 1, 0, 1 -AVX_INSTR mulsd, 1, 0, 1 -AVX_INSTR mulss, 1, 0, 1 -AVX_INSTR orpd, 1, 0, 1 -AVX_INSTR orps, 1, 0, 1 -AVX_INSTR pabsb, 0, 0, 0 -AVX_INSTR pabsw, 0, 0, 0 -AVX_INSTR pabsd, 0, 0, 0 -AVX_INSTR packsswb, 0, 0, 0 -AVX_INSTR packssdw, 0, 0, 0 -AVX_INSTR packuswb, 0, 0, 0 -AVX_INSTR packusdw, 0, 0, 0 -AVX_INSTR paddb, 0, 0, 1 -AVX_INSTR paddw, 0, 0, 1 -AVX_INSTR paddd, 0, 0, 1 -AVX_INSTR paddq, 0, 0, 1 -AVX_INSTR paddsb, 0, 0, 1 -AVX_INSTR paddsw, 0, 0, 1 -AVX_INSTR paddusb, 0, 0, 1 -AVX_INSTR paddusw, 0, 0, 1 -AVX_INSTR palignr, 0, 1, 0 -AVX_INSTR pand, 0, 0, 1 -AVX_INSTR pandn, 0, 0, 0 -AVX_INSTR pavgb, 0, 0, 1 -AVX_INSTR pavgw, 0, 0, 1 -AVX_INSTR pblendvb, 0, 0, 0 -AVX_INSTR pblendw, 0, 1, 0 -AVX_INSTR pcmpestri, 0, 0, 0 -AVX_INSTR pcmpestrm, 0, 0, 0 -AVX_INSTR pcmpistri, 0, 0, 0 -AVX_INSTR pcmpistrm, 0, 0, 0 -AVX_INSTR pcmpeqb, 0, 0, 1 -AVX_INSTR pcmpeqw, 0, 0, 1 -AVX_INSTR pcmpeqd, 0, 0, 1 -AVX_INSTR pcmpeqq, 0, 0, 1 -AVX_INSTR pcmpgtb, 0, 0, 0 -AVX_INSTR pcmpgtw, 0, 0, 0 -AVX_INSTR pcmpgtd, 0, 0, 0 -AVX_INSTR pcmpgtq, 0, 0, 0 -AVX_INSTR phaddw, 0, 0, 0 -AVX_INSTR phaddd, 0, 0, 0 -AVX_INSTR phaddsw, 0, 0, 0 -AVX_INSTR phsubw, 0, 0, 0 -AVX_INSTR phsubd, 0, 0, 0 -AVX_INSTR phsubsw, 0, 0, 0 -AVX_INSTR pmaddwd, 0, 0, 1 -AVX_INSTR pmaddubsw, 0, 0, 0 -AVX_INSTR pmaxsb, 0, 0, 1 -AVX_INSTR pmaxsw, 0, 0, 1 -AVX_INSTR pmaxsd, 0, 0, 1 -AVX_INSTR pmaxub, 0, 0, 1 -AVX_INSTR pmaxuw, 0, 0, 1 -AVX_INSTR pmaxud, 0, 0, 1 -AVX_INSTR pminsb, 0, 0, 1 -AVX_INSTR pminsw, 0, 0, 1 -AVX_INSTR pminsd, 0, 0, 1 -AVX_INSTR pminub, 0, 0, 1 -AVX_INSTR pminuw, 0, 0, 1 -AVX_INSTR pminud, 0, 0, 1 -AVX_INSTR pmovmskb, 0, 0, 0 -AVX_INSTR pmulhuw, 0, 0, 1 -AVX_INSTR pmulhrsw, 0, 0, 1 -AVX_INSTR pmulhw, 0, 0, 1 -AVX_INSTR pmullw, 0, 0, 1 -AVX_INSTR pmulld, 0, 0, 1 -AVX_INSTR pmuludq, 0, 0, 1 -AVX_INSTR pmuldq, 0, 0, 1 -AVX_INSTR por, 0, 0, 1 -AVX_INSTR psadbw, 0, 0, 1 -AVX_INSTR pshufb, 0, 0, 0 -AVX_INSTR pshufd, 0, 1, 0 -AVX_INSTR pshufhw, 0, 1, 0 -AVX_INSTR pshuflw, 0, 1, 0 -AVX_INSTR psignb, 0, 0, 0 -AVX_INSTR psignw, 0, 0, 0 -AVX_INSTR psignd, 0, 0, 0 -AVX_INSTR psllw, 0, 0, 0 -AVX_INSTR pslld, 0, 0, 0 -AVX_INSTR psllq, 0, 0, 0 -AVX_INSTR pslldq, 0, 0, 0 -AVX_INSTR psraw, 0, 0, 0 -AVX_INSTR psrad, 0, 0, 0 -AVX_INSTR psrlw, 0, 0, 0 -AVX_INSTR psrld, 0, 0, 0 -AVX_INSTR psrlq, 0, 0, 0 -AVX_INSTR psrldq, 0, 0, 0 -AVX_INSTR psubb, 0, 0, 0 -AVX_INSTR psubw, 0, 0, 0 -AVX_INSTR psubd, 0, 0, 0 -AVX_INSTR psubq, 0, 0, 0 -AVX_INSTR psubsb, 0, 0, 0 -AVX_INSTR psubsw, 0, 0, 0 -AVX_INSTR psubusb, 0, 0, 0 -AVX_INSTR psubusw, 0, 0, 0 -AVX_INSTR ptest, 0, 0, 0 -AVX_INSTR punpckhbw, 0, 0, 0 -AVX_INSTR punpckhwd, 0, 0, 0 -AVX_INSTR punpckhdq, 0, 0, 0 -AVX_INSTR punpckhqdq, 0, 0, 0 -AVX_INSTR punpcklbw, 0, 0, 0 -AVX_INSTR punpcklwd, 0, 0, 0 -AVX_INSTR punpckldq, 0, 0, 0 -AVX_INSTR punpcklqdq, 0, 0, 0 -AVX_INSTR pxor, 0, 0, 1 -AVX_INSTR shufps, 1, 1, 0 -AVX_INSTR subpd, 1, 0, 0 -AVX_INSTR subps, 1, 0, 0 -AVX_INSTR subsd, 1, 0, 0 -AVX_INSTR subss, 1, 0, 0 -AVX_INSTR unpckhpd, 1, 0, 0 -AVX_INSTR unpckhps, 1, 0, 0 -AVX_INSTR unpcklpd, 1, 0, 0 -AVX_INSTR unpcklps, 1, 0, 0 -AVX_INSTR xorpd, 1, 0, 1 -AVX_INSTR xorps, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0, 1 -AVX_INSTR pfsub, 1, 0, 0 -AVX_INSTR pfmul, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif -%assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %else - %6 %1, %2, %3 - %7 %1, %4 - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsdd, pmulld, paddd -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmadcswd, pmaddwd, paddd - -; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. -; This lets us use tzcnt without bumping the yasm version requirement yet. -%define tzcnt rep bsf |