From 0c496a4ff92f6c8c3607bda29f442e254efd28f4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 1 May 2023 21:55:05 -0700 Subject: cpu.h: add WEBP_AARCH64 and define it to true for __aarch64__ and Win Arm64 + Visual Studio. Microsoft's compiler (cl.exe) does not define __aarch64__, but relies on _M_ARM64 & _M_ARM64EC Bug: b/277254922 Change-Id: I20e4fa07a4031599db69e3d7ba9050345315ef51 --- src/dec/tree_dec.c | 3 ++- src/dsp/cost_neon.c | 4 ++-- src/dsp/cpu.h | 9 +++++++-- src/dsp/dec_neon.c | 4 ++-- src/dsp/enc_neon.c | 4 ++-- src/dsp/lossless_enc_neon.c | 2 +- src/dsp/lossless_neon.c | 2 +- src/dsp/neon.h | 4 ++-- src/dsp/quant.h | 2 +- src/utils/bit_reader_utils.c | 3 ++- src/utils/bit_reader_utils.h | 3 ++- 11 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/dec/tree_dec.c b/src/dec/tree_dec.c index 1c6fdea2..24346059 100644 --- a/src/dec/tree_dec.c +++ b/src/dec/tree_dec.c @@ -12,10 +12,11 @@ // Author: Skal (pascal.massimino@gmail.com) #include "src/dec/vp8i_dec.h" +#include "src/dsp/cpu.h" #include "src/utils/bit_reader_inl_utils.h" #if !defined(USE_GENERIC_TREE) -#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then. 
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE #else diff --git a/src/dsp/cost_neon.c b/src/dsp/cost_neon.c index 8cc8ce58..6582669c 100644 --- a/src/dsp/cost_neon.c +++ b/src/dsp/cost_neon.c @@ -29,7 +29,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs, const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1)); const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position)); -#ifdef __aarch64__ +#if WEBP_AARCH64 res->last = vmaxvq_u8(masked) - 1; #else const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked)); @@ -43,7 +43,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs, vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0); --res->last; -#endif // __aarch64__ +#endif // WEBP_AARCH64 res->coeffs = coeffs; } diff --git a/src/dsp/cpu.h b/src/dsp/cpu.h index 1b48eaa1..94ad34d5 100644 --- a/src/dsp/cpu.h +++ b/src/dsp/cpu.h @@ -105,6 +105,12 @@ #define WEBP_USE_INTRINSICS #endif +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#define WEBP_AARCH64 1 +#else +#define WEBP_AARCH64 0 +#endif + #if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) #define WEBP_HAVE_NEON #endif @@ -134,8 +140,7 @@ #define WEBP_NEON_OMIT_C_CODE 0 #endif -#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \ - defined(__aarch64__)) +#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64) #define WEBP_NEON_WORK_AROUND_GCC 1 #else #define WEBP_NEON_WORK_AROUND_GCC 0 diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index fa851707..22784cf1 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -1428,7 +1428,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { const uint8x8_t A = vld1_u8(dst - BPS); // top row -#if defined(__aarch64__) +#if WEBP_AARCH64 const uint16_t p2 = vaddlv_u8(A); sum_top = vdupq_n_u16(p2); #else @@ -1511,7 +1511,7 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { 
const uint8x16_t A = vld1q_u8(dst - BPS); // top row -#if defined(__aarch64__) +#if WEBP_AARCH64 const uint16_t p3 = vaddlvq_u8(A); sum_top = vdupq_n_u16(p3); #else diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 3a04111c..71480036 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -764,7 +764,7 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, // Horizontal sum of all four uint32_t values in 'sum'. static int SumToInt_NEON(uint32x4_t sum) { -#if defined(__aarch64__) +#if WEBP_AARCH64 return (int)vaddvq_u32(sum); #else const uint64x2_t sum2 = vpaddlq_u32(sum); @@ -865,7 +865,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], uint8x8x4_t shuffles; // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. -#if defined(__APPLE__) && defined(__aarch64__) && \ +#if defined(__APPLE__) && WEBP_AARCH64 && \ defined(__apple_build_version__) && (__apple_build_version__< 6020037) uint8x16x2_t all_out; INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); diff --git a/src/dsp/lossless_enc_neon.c b/src/dsp/lossless_enc_neon.c index 7c7b73f8..e32c7961 100644 --- a/src/dsp/lossless_enc_neon.c +++ b/src/dsp/lossless_enc_neon.c @@ -25,7 +25,7 @@ // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. -#if defined(__APPLE__) && defined(__aarch64__) && \ +#if defined(__APPLE__) && WEBP_AARCH64 && \ defined(__apple_build_version__) && (__apple_build_version__< 6020037) #define USE_VTBLQ #endif diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c index 89e3e013..ddc9b617 100644 --- a/src/dsp/lossless_neon.c +++ b/src/dsp/lossless_neon.c @@ -498,7 +498,7 @@ static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper, // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. 
-#if defined(__APPLE__) && defined(__aarch64__) && \ +#if defined(__APPLE__) && WEBP_AARCH64 && \ defined(__apple_build_version__) && (__apple_build_version__< 6020037) #define USE_VTBLQ #endif diff --git a/src/dsp/neon.h b/src/dsp/neon.h index c591f9b9..f806a278 100644 --- a/src/dsp/neon.h +++ b/src/dsp/neon.h @@ -21,7 +21,7 @@ // Right now, some intrinsics functions seem slower, so we disable them // everywhere except newer clang/gcc or aarch64 where the inline assembly is // incompatible. -#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__) +#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || WEBP_AARCH64 #define WEBP_USE_INTRINSICS // use intrinsics when possible #endif @@ -46,7 +46,7 @@ // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) -#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) +#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || WEBP_AARCH64) #define WORK_AROUND_GCC #endif diff --git a/src/dsp/quant.h b/src/dsp/quant.h index fc099bf9..bf7734cb 100644 --- a/src/dsp/quant.h +++ b/src/dsp/quant.h @@ -22,7 +22,7 @@ #define IsFlat IsFlat_NEON static uint32_t horizontal_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if WEBP_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); diff --git a/src/utils/bit_reader_utils.c b/src/utils/bit_reader_utils.c index 857cd609..a26557aa 100644 --- a/src/utils/bit_reader_utils.c +++ b/src/utils/bit_reader_utils.c @@ -15,6 +15,7 @@ #include "src/webp/config.h" #endif +#include "src/dsp/cpu.h" #include "src/utils/bit_reader_inl_utils.h" #include "src/utils/utils.h" @@ -121,7 +122,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits, #define VP8L_LOG8_WBITS 4 // Number of bytes needed to store VP8L_WBITS bits. 
-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ +#if defined(__arm__) || defined(_M_ARM) || WEBP_AARCH64 || \ defined(__i386__) || defined(_M_IX86) || \ defined(__x86_64__) || defined(_M_X64) #define VP8L_USE_FAST_LOAD diff --git a/src/utils/bit_reader_utils.h b/src/utils/bit_reader_utils.h index e64156e3..25ff31e5 100644 --- a/src/utils/bit_reader_utils.h +++ b/src/utils/bit_reader_utils.h @@ -19,6 +19,7 @@ #ifdef _MSC_VER #include <stdlib.h> // _byteswap_ulong #endif +#include "src/dsp/cpu.h" #include "src/webp/types.h" // Warning! This macro triggers quite some MACRO wizardry around func signature! @@ -64,7 +65,7 @@ extern "C" { #define BITS 56 #elif defined(__arm__) || defined(_M_ARM) // ARM #define BITS 24 -#elif defined(__aarch64__) // ARM 64bit +#elif WEBP_AARCH64 // ARM 64bit #define BITS 56 #elif defined(__mips__) // MIPS #define BITS 24 -- cgit v1.2.1