author    | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2023-01-04 01:58:41 -0800
committer | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2023-01-04 02:19:32 -0800
commit    | d7b19a432bcd0add3b591e54a2ee0fbdbfc1b45e (patch)
tree      | 07490cf91d80d22f39d4d3cf68f2f58ee1ca43b5 /numpy
parent    | 2a258c3b47eec9521120ebbafca0226ecadc5d0d (diff)
download  | numpy-d7b19a432bcd0add3b591e54a2ee0fbdbfc1b45e.tar.gz
Address feedback from 2022-12-14
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src | 703 |
1 file changed, 361 insertions, 342 deletions
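The patch below replaces the string-templated @sfx@ helpers with explicit f32/f64 intrinsics (npyv_isnan_*, npyv_isinf_*, npyv_isfinite_*, npyv_signbit_*) plus multi-vector npyv_pack_* variants, and it classifies floats by inspecting their IEEE-754 bits instead of comparing against FLT_MAX/DBL_MAX. For orientation, here is a minimal scalar sketch (not part of the patch; helper names are illustrative only) of the bit tests the new intrinsics vectorize, using the same masks that appear in the diff (0x7f800000 for the f32 exponent field, shift-by-31 for the sign bit):

```c
#include <math.h>     /* INFINITY, NAN */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reinterpret a float's bits (well-defined via memcpy). */
static uint32_t f32_bits(float x) { uint32_t u; memcpy(&u, &x, sizeof u); return u; }

/* isinf: shift out the sign bit; inf has all exponent bits set and a zero mantissa. */
static int sketch_isinf_f32(float x)    { return (f32_bits(x) << 1) == 0xff000000u; }

/* isfinite: finite means the exponent field is NOT all ones. */
static int sketch_isfinite_f32(float x) { return (f32_bits(x) & 0x7f800000u) != 0x7f800000u; }

/* signbit: the top bit, shifted down so the result is exactly 0 or 1. */
static int sketch_signbit_f32(float x)  { return (int)(f32_bits(x) >> 31); }

/* isnan: NaN is the only value that compares unequal to itself. */
static int sketch_isnan_f32(float x)    { return x != x; }

int main(void)
{
    printf("isinf(INFINITY) = %d\n", sketch_isinf_f32(INFINITY));   /* 1 */
    printf("isfinite(NAN)   = %d\n", sketch_isfinite_f32(NAN));     /* 0 */
    printf("signbit(-0.0f)  = %d\n", sketch_signbit_f32(-0.0f));    /* 1 */
    printf("isnan(NAN)      = %d\n", sketch_isnan_f32(NAN));        /* 1 */
    return 0;
}
```

The vector versions in the diff do the same test per lane and additionally AND the comparison mask with a vector of 1s (npyv_setall_*(1==1)) so that the bytes written back are valid NumPy booleans rather than 0x00/0xFF masks.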
diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src index fed8213df..9fb1aed0a 100644 --- a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src @@ -15,14 +15,23 @@ * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src * building for VX and VXE so we don't regress performance while adding this * code for other platforms. -*/ + */ +// TODO(@seiko2plus): add support for big-endian +#if NPY_SIMD_BIGENDIAN + #undef NPY_SIMD + #undef NPY_SIMD_F32 + #undef NPY_SIMD_F64 + #define NPY_SIMD 0 + #define NPY_SIMD_F32 0 + #define NPY_SIMD_F64 0 +#endif /** * Force use SSE only on x86, even if AVX2 or AVX512F are enabled * through the baseline, since scatter(AVX512F) and gather very costly * to handle non-contiguous memory access comparing with SSE for * such small operations that this file covers. -*/ + */ #define NPY_SIMD_FORCE_128 #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -42,212 +51,346 @@ ******************************************************************************/ #if NPY_SIMD -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# - * #FDMAX = FLT_MAX, DBL_MAX# - * #fd = f, # - * #ssfx = 32, 64# + +/** + * We define intrinsics for isnan, isinf, isfinite, and signbit below. There's + * a few flavors of each. We'll use f32 as an example although f64 versions + * are also defined. + * + * npyv_u32 npyv_KIND_f32(npyv_f32 v) + * These are mainly used for the single vector loops. As such, result should + * be bool true / false, ready to write back. + * + * npyv_b32 _npyv_KIND_f32(npyv_f32 v) + * These are used by the geneal intrinsics above as well as the multi-vector + * packing intrinsics. The multi-vector packing intrinsics are the ones + * utilized in the unrolled vector loops. Results should be vector masks + * of 0x00/0xff. + * + * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) + * These are the multi-vector packing intrinsics utilized by unrolled vector + * loops. They perform the operation on all input vectors and pack the + * results to a single npyv_u8. Assuming NPY_SIMD == 128, that means we + * can pack results from 4x npyv_f32 or 8x npyv_64 in a single npyv_u8. + * Result should be bool true / false, ready to write back. 
*/ -#if @VCHK@ -static NPY_INLINE npyv_u@ssfx@ -npyv_isnan_@sfx@(npyv_@sfx@ v) +#if NPY_SIMD_F32 +NPY_FINLINE npyv_u32 +npyv_isnan_f32(npyv_f32 v) { - // (v != v) >> (size - 1) -#if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) - // sse npyv_cmpneq_@sfx@ define includes a cast already - npyv_u@ssfx@ r = npyv_cmpneq_@sfx@(v, v); -#else - npyv_u@ssfx@ r = npyv_cvt_u@ssfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v)); -#endif -#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE) - // don't do the shift on s390x - return r; -#else - return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); + const npyv_u32 truemask = npyv_setall_u32(1==1); + return npyv_andc_u8(npyv_reinterpret_u8_u32(truemask), + npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v)))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b8 notnan = npyv_pack_b8_b32( + npyv_notnan_f32(v0), + npyv_notnan_f32(v1), + npyv_notnan_f32(v2), + npyv_notnan_f32(v3) + ); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); +} #endif +#if NPY_SIMD_F64 +NPY_FINLINE npyv_u64 +npyv_isnan_f64(npyv_f64 v) +{ + const npyv_u64 truemask = npyv_setall_u64(1==1); + return npyv_andc_u8(npyv_reinterpret_u8_u64(truemask), + npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v)))); } +NPY_FINLINE npyv_u8 +npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b8 notnan = npyv_pack_b8_b64( + npyv_notnan_f64(v0), + npyv_notnan_f64(v1), + npyv_notnan_f64(v2), + npyv_notnan_f64(v3), + npyv_notnan_f64(v4), + npyv_notnan_f64(v5), + npyv_notnan_f64(v6), + npyv_notnan_f64(v7) + ); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); -static NPY_INLINE npyv_u@ssfx@ -npyv_isinf_@sfx@(npyv_@sfx@ v) +} +#endif + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_b32 +_npyv_isinf_f32(npyv_f32 v) { - // (abs(v) > fltmax) >> (size - 1) - const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@); #if defined(NPY_HAVE_NEON) - npyv_u@ssfx@ r = vcagtq_@sfx@(v, fltmax); + // abs(v) > FLT_MAX + const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX); + return vcagtq_f32(v, fltmax); #else - // fabs via masking of sign bit - const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); - npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask)); - #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) - // return cast already done in npyv_cmpgt_@sfx@ - npyv_u@ssfx@ r = npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax); - #else - npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax)); - #endif + // cast out the sign and check if all exponent bits are set. 
+ const npyv_u32 exp_mask = npyv_setall_u32(0xff000000); + npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1); + return npyv_cmpeq_u32(bits, exp_mask); #endif -#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE) - // don't do the shift on s390x - return r; +} +NPY_FINLINE npyv_u32 +npyv_isinf_f32(npyv_f32 v) +{ + const npyv_u32 truemask = npyv_setall_u32(1==1); + return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b8 isinf = npyv_pack_b8_b32( + _npyv_isinf_f32(v0), + _npyv_isinf_f32(v1), + _npyv_isinf_f32(v2), + _npyv_isinf_f32(v3) + ); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); +} +#endif // NPY_SIMD_F32 +#if NPY_SIMD_F64 +NPY_FINLINE npyv_b64 +_npyv_isinf_f64(npyv_f64 v) +{ +#if defined(NPY_HAVE_NEON) + // abs(v) > DBL_MAX + const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX); + return vcagtq_f64(v, fltmax); #else - return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); + // cast out the sign and check if all exponent bits are set. + const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000); + npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1); + return npyv_cmpeq_u64(bits, exp_mask); #endif } +NPY_FINLINE npyv_u64 +npyv_isinf_f64(npyv_f64 v) +{ + const npyv_u64 truemask = npyv_setall_u64(1==1); + return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b8 isinf = npyv_pack_b8_b64( + _npyv_isinf_f64(v0), + _npyv_isinf_f64(v1), + _npyv_isinf_f64(v2), + _npyv_isinf_f64(v3), + _npyv_isinf_f64(v4), + _npyv_isinf_f64(v5), + _npyv_isinf_f64(v6), + _npyv_isinf_f64(v7) + ); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); +} +#endif // NPY_SIMD_F64 + -static NPY_INLINE npyv_u@ssfx@ -npyv_isfinite_@sfx@(npyv_@sfx@ v) +#if NPY_SIMD_F32 +NPY_FINLINE npyv_b32 +npyv_notfinite_f32(npyv_f32 v) { - // ((v & signmask) <= fltmax) >> (size-1) - const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@); -#if defined(NPY_HAVE_NEON) - npyv_u@ssfx@ r = vcaleq_@sfx@(v, fltmax); + // cast out the sign and check if all exponent bits are set + // no matter the mentissa is. + const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000); + npyv_u32 bits = npyv_reinterpret_u32_f32(v); + bits = npyv_and_u32(bits, exp_mask); + return npyv_cmpeq_u32(bits, exp_mask); +} +NPY_FINLINE npyv_u32 +npyv_isfinite_f32(npyv_f32 v) +{ + const npyv_u32 truemask = npyv_setall_u32(1==1); + return npyv_andc_u8(npyv_reinterpret_u8_u32(truemask), + npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v)))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // F32 exponent is 8-bits, which means we can pack multiple into + // a single vector. We shift out sign bit so that we're left + // with only exponent in high byte. If not all bits are set, + // then we've got a finite number. 
+ uint8x16x4_t tbl; + tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1)); + tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1)); + tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1)); + tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1)); + + const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; + npyv_u8 r = vqtbl4q_u8(tbl, permute); + + const npyv_u8 expmask = npyv_setall_u8(0xff); + r = npyv_cmpneq_u8(r, expmask); + r = vshrq_n_u8(r, 7); + return r; #else - // fabs via masking of sign bit - const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); - npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask)); - #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) - // return cast already done in npyv_cmpgt_@sfx@ - npyv_u@ssfx@ r = npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax); - #else - npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax)); - #endif + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b8 notfinite = npyv_pack_b8_b32( + npyv_notfinite_f32(v0), + npyv_notfinite_f32(v1), + npyv_notfinite_f32(v2), + npyv_notfinite_f32(v3) + ); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); #endif -#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE) - // don't do the shift on s390x +} +#endif // NPY_SIMD_F32 +#if NPY_SIMD_F64 +NPY_FINLINE npyv_b64 +npyv_notfinite_f64(npyv_f64 v) +{ + // cast out the sign and check if all exponent bits are set + // no matter the mantissa is. + const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000); + npyv_u64 bits = npyv_reinterpret_u64_f64(v); + bits = npyv_and_u64(bits, exp_mask); + return npyv_cmpeq_u64(bits, exp_mask); +} +NPY_FINLINE npyv_u64 +npyv_isfinite_f64(npyv_f64 v) +{ + const npyv_u64 truemask = npyv_setall_u64(1==1); + return npyv_andc_u8(npyv_reinterpret_u8_u64(truemask), + npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v)))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // F64 exponent is 11-bits, which means we can pack multiple into + // a single vector. We'll need to use u16 to fit all exponent + // bits. If not all bits are set, then we've got a finite number. 
+ uint8x16x4_t t0123, t4567; + t0123.val[0] = npyv_reinterpret_u8_f64(v0); + t0123.val[1] = npyv_reinterpret_u8_f64(v1); + t0123.val[2] = npyv_reinterpret_u8_f64(v2); + t0123.val[3] = npyv_reinterpret_u8_f64(v3); + t4567.val[0] = npyv_reinterpret_u8_f64(v4); + t4567.val[1] = npyv_reinterpret_u8_f64(v5); + t4567.val[2] = npyv_reinterpret_u8_f64(v6); + t4567.val[3] = npyv_reinterpret_u8_f64(v7); + + const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63}; + npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); + npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); + + const npyv_u16 expmask = npyv_setall_u16(0x7ff0); + r0 = npyv_and_u16(r0, expmask); + r0 = npyv_cmpneq_u16(r0, expmask); + r0 = npyv_shri_u16(r0, 15); + r1 = npyv_and_u16(r1, expmask); + r1 = npyv_cmpneq_u16(r1, expmask); + r1 = npyv_shri_u16(r1, 15); + + npyv_u8 r = npyv_pack_b8_b16(r0, r1); return r; #else - return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b8 notfinite = npyv_pack_b8_b64( + npyv_notfinite_f64(v0), + npyv_notfinite_f64(v1), + npyv_notfinite_f64(v2), + npyv_notfinite_f64(v3), + npyv_notfinite_f64(v4), + npyv_notfinite_f64(v5), + npyv_notfinite_f64(v6), + npyv_notfinite_f64(v7) + ); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); #endif } +#endif // NPY_SIMD_F64 -static NPY_INLINE npyv_u@ssfx@ -npyv_signbit_@sfx@(npyv_@sfx@ v) +#if NPY_SIMD_F32 +NPY_FINLINE npyv_u32 +npyv_signbit_f32(npyv_f32 v) +{ + return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1); +} +NPY_FINLINE npyv_u8 +npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) { -#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN - // via masking of sign bit - const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); - npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_and_@sfx@(v, signmask)); +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // We only need high byte for signbit, which means we can pack + // multiple inputs into a single vector. + uint8x16x4_t tbl; + tbl.val[0] = npyv_reinterpret_u8_f32(v0); + tbl.val[1] = npyv_reinterpret_u8_f32(v1); + tbl.val[2] = npyv_reinterpret_u8_f32(v2); + tbl.val[3] = npyv_reinterpret_u8_f32(v3); + + const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; + npyv_u8 r = vqtbl4q_u8(tbl, permute); + r = vshrq_n_u8(r, 7); return r; #else - return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(v), (sizeof(npyv_lanetype_@sfx@)*8)-1); + npyv_b8 signbit = npyv_pack_b8_b32( + npyv_cvt_b32_u32(npyv_signbit_f32(v0)), + npyv_cvt_b32_u32(npyv_signbit_f32(v1)), + npyv_cvt_b32_u32(npyv_signbit_f32(v2)), + npyv_cvt_b32_u32(npyv_signbit_f32(v3)) + ); + return signbit; #endif } - -#endif // @VCHK@ -/**end repeat**/ - -// In these functions we use vqtbl4q_u8 which is only available on aarch64 +#endif // NPY_SIMD_F32 +#if NPY_SIMD_F64 +NPY_FINLINE npyv_u64 +npyv_signbit_f64(npyv_f64 v) +{ + return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1); +} +NPY_FINLINE npyv_u8 +npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ #if defined(NPY_HAVE_NEON) && defined(__aarch64__) - #define PREPACK_ISFINITE 1 - #define PREPACK_SIGNBIT 1 - - #if NPY_SIMD_F32 - - static NPY_INLINE npyv_u8 - npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) - { - // F32 exponent is 8-bits, which means we can pack multiple into - // a single vector. 
We shift out sign bit so that we're left - // with only exponent in high byte. If not all bits are set, - // then we've got a finite number. - uint8x16x4_t tbl; - tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1)); - tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1)); - tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1)); - tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1)); - - const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; - npyv_u8 r = vqtbl4q_u8(tbl, permute); - - const npyv_u8 expmask = npyv_setall_u8(0xff); - r = npyv_cmpneq_u8(r, expmask); - r = vshrq_n_u8(r, 7); - return r; - } + // We only need high byte for signbit, which means we can pack + // multiple inputs into a single vector. - static NPY_INLINE npyv_u8 - npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) - { - // We only need high byte for signbit, which means we can pack - // multiple inputs into a single vector. - uint8x16x4_t tbl; - tbl.val[0] = npyv_reinterpret_u8_f32(v0); - tbl.val[1] = npyv_reinterpret_u8_f32(v1); - tbl.val[2] = npyv_reinterpret_u8_f32(v2); - tbl.val[3] = npyv_reinterpret_u8_f32(v3); - - const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; - npyv_u8 r = vqtbl4q_u8(tbl, permute); - r = vshrq_n_u8(r, 7); - return r; - } - - #endif // NPY_SIMD_F32 - - #if NPY_SIMD_F64 - - static NPY_INLINE npyv_u8 - npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) - { - // F64 exponent is 11-bits, which means we can pack multiple into - // a single vector. We'll need to use u16 to fit all exponent - // bits. If not all bits are set, then we've got a finite number. - uint8x16x4_t t0123, t4567; - t0123.val[0] = npyv_reinterpret_u8_f64(v0); - t0123.val[1] = npyv_reinterpret_u8_f64(v1); - t0123.val[2] = npyv_reinterpret_u8_f64(v2); - t0123.val[3] = npyv_reinterpret_u8_f64(v3); - t4567.val[0] = npyv_reinterpret_u8_f64(v4); - t4567.val[1] = npyv_reinterpret_u8_f64(v5); - t4567.val[2] = npyv_reinterpret_u8_f64(v6); - t4567.val[3] = npyv_reinterpret_u8_f64(v7); - - const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63}; - npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); - npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); - - const npyv_u16 expmask = npyv_setall_u16(0x7ff0); - r0 = npyv_and_u16(r0, expmask); - r0 = npyv_cmpneq_u16(r0, expmask); - r0 = npyv_shri_u16(r0, 15); - r1 = npyv_and_u16(r1, expmask); - r1 = npyv_cmpneq_u16(r1, expmask); - r1 = npyv_shri_u16(r1, 15); - - npyv_u8 r = npyv_pack_b8_b16(r0, r1); - return r; - } - - static NPY_INLINE npyv_u8 - npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) - { - // We only need high byte for signbit, which means we can pack - // multiple inputs into a single vector. 
- - // vuzp2 faster than vtbl for f64 - npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1)); - npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3)); - npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5)); - npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7)); - - npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23)); - npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67)); - - npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567)); - r = vshrq_n_u8(r, 7); - return r; - } + // vuzp2 faster than vtbl for f64 + npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1)); + npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3)); + npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5)); + npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7)); - #endif // NPY_SIMD_F64 + npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23)); + npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67)); + npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567)); + r = vshrq_n_u8(r, 7); + return r; #else - #define PREPACK_ISFINITE 0 - #define PREPACK_SIGNBIT 0 -#endif // defined(NPY_HAVE_NEON) && defined(__aarch64__) + npyv_b8 signbit = npyv_pack_b8_b64( + npyv_cvt_b64_u64(npyv_signbit_f64(v0)), + npyv_cvt_b64_u64(npyv_signbit_f64(v1)), + npyv_cvt_b64_u64(npyv_signbit_f64(v2)), + npyv_cvt_b64_u64(npyv_signbit_f64(v3)), + npyv_cvt_b64_u64(npyv_signbit_f64(v4)), + npyv_cvt_b64_u64(npyv_signbit_f64(v5)), + npyv_cvt_b64_u64(npyv_signbit_f64(v6)), + npyv_cvt_b64_u64(npyv_signbit_f64(v7)) + ); + return signbit; +#endif +} +#endif // NPY_SIMD_F64 #endif // NPY_SIMD @@ -264,75 +407,19 @@ npyv_signbit_@sfx@(npyv_@sfx@ v) #define CONTIG 0 #define NCONTIG 1 -/* - * clang has a bug that's present at -O1 or greater. When partially loading a - * vector register for a reciprocal operation, the remaining elements are set - * to 1 to avoid divide-by-zero. The partial load is paired with a partial - * store after the reciprocal operation. clang notices that the entire register - * is not needed for the store and optimizes out the fill of 1 to the remaining - * elements. This causes either a divide-by-zero or 0/0 with invalid exception - * that we were trying to avoid by filling. - * - * Using a dummy variable marked 'volatile' convinces clang not to ignore - * the explicit fill of remaining elements. If `-ftrapping-math` is - * supported, then it'll also avoid the bug. `-ftrapping-math` is supported - * on Apple clang v12+ for x86_64. It is not currently supported for arm64. - * `-ftrapping-math` is set by default of Numpy builds in - * numpy/distutils/ccompiler.py. 
- * - * Note: Apple clang and clang upstream have different versions that overlap - */ -#if defined(__clang__) - #if defined(__apple_build_version__) - // Apple Clang - #if __apple_build_version__ < 12000000 - // Apple Clang before v12 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - // Apple Clang after v12, targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 - #else - // Apple Clang after v12, not targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #endif - #else - // Clang, not Apple Clang - #if __clang_major__ < 10 - // Clang before v10 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #elif defined(_MSC_VER) - // clang-cl has the same bug - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - // Clang v10+, targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 - #else - // Clang v10+, not targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #endif - #endif -#else -// Not a Clang compiler -#define WORKAROUND_CLANG_RECIPROCAL_BUG 0 -#endif - /**begin repeat * #TYPE = FLOAT, DOUBLE# * #sfx = f32, f64# * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# - * #FDMAX = FLT_MAX, DBL_MAX# - * #fd = f, # * #ssfx = 32, 64# */ #if @VCHK@ /**begin repeat1 * #kind = isnan, isinf, isfinite, signbit# - * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT# */ /**begin repeat2 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG# - * #unroll = 1, 1, 1, 1# */ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ (const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len) @@ -345,116 +432,53 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ assert(PACK_FACTOR == 4 || PACK_FACTOR == 8); const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep * @unroll@ * PACK_FACTOR; + const int wstep = vstep * PACK_FACTOR; // unrolled iterations for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - /**begin repeat3 - * #N = 0, 1, 2, 3# - */ - #if @unroll@ > @N@ - // Load vectors - #if @STYPE@ == CONTIG - // contiguous input - npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n) - npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@)); - npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@)); - npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@)); - #if PACK_FACTOR == 8 - npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@)); - npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@)); - npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@)); - npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78 - #endif - #else - // non-contiguous input - npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride); - #if PACK_FACTOR == 8 - npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 
+ PACK_FACTOR * @N@), istride); - npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride); - #endif - #endif - #endif // @unroll@ > @N@ - /**end repeat3**/ - - /**begin repeat3 - * #N = 0, 1, 2, 3# - */ - #if @unroll@ > @N@ - #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - #if @ssfx@ == 32 - npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@); - #elif @ssfx@ == 64 - npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@, - v4_@N@, v5_@N@, v6_@N@, v7_@N@); + // Load vectors + #if @STYPE@ == CONTIG + // contiguous input + npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0); + npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1); + npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2); + npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3); + #if PACK_FACTOR == 8 + npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4); + npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5); + npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6); + npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7); #endif #else - npyv_b@ssfx@ r0_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v0_@N@)); - npyv_b@ssfx@ r1_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v1_@N@)); - npyv_b@ssfx@ r2_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v2_@N@)); - npyv_b@ssfx@ r3_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v3_@N@)); + // non-contiguous input + npyv_@sfx@ v0 = npyv_loadn_@sfx@(ip + istride * vstep * 0, istride); + npyv_@sfx@ v1 = npyv_loadn_@sfx@(ip + istride * vstep * 1, istride); + npyv_@sfx@ v2 = npyv_loadn_@sfx@(ip + istride * vstep * 2, istride); + npyv_@sfx@ v3 = npyv_loadn_@sfx@(ip + istride * vstep * 3, istride); #if PACK_FACTOR == 8 - npyv_b@ssfx@ r4_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v4_@N@)); - npyv_b@ssfx@ r5_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v5_@N@)); - npyv_b@ssfx@ r6_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v6_@N@)); - npyv_b@ssfx@ r7_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v7_@N@)); - #endif // PACK_FACTOR == 8 - #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - #endif // @unroll@ > @N@ - /**end repeat3**/ - - /**begin repeat3 - * #N = 0, 1, 2, 3# - */ - #if @unroll@ > @N@ - #if @DTYPE@ == CONTIG - #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - // Nothing to do, results already packed - #else - // Pack results - #if PACK_FACTOR == 4 - npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@)); - #elif PACK_FACTOR == 8 - npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, r3_@N@, - r4_@N@, r5_@N@, r6_@N@, r7_@N@)); - #endif // PACK_FACTOR == 8 - #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - - npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@); + npyv_@sfx@ v4 = npyv_loadn_@sfx@(ip + istride * vstep * 4, istride); + npyv_@sfx@ v5 = npyv_loadn_@sfx@(ip + istride * vstep * 5, istride); + npyv_@sfx@ v6 = npyv_loadn_@sfx@(ip + istride * vstep * 6, istride); + npyv_@sfx@ v7 = npyv_loadn_@sfx@(ip + istride * vstep * 7, istride); + #endif + #endif + + #if PACK_FACTOR == 4 + npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3); + #elif PACK_FACTOR == 8 + npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3, v4, v5, v6, v7); + #endif + #if @DTYPE@ == CONTIG + npyv_store_u8(op, r); #else // @DTYPE@ == CONTIG - #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - // Results are packed, so we can just loop over them - npy_uint8 lane_@N@[npyv_nlanes_u8]; - npyv_store_u8(lane_@N@, r_@N@); - for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){ - op[(ln + @N@ * 
PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)]; - } - #else - // Results are not packed. Use template to loop. - /**begin repeat4 - * #R = 0, 1, 2, 3, 4, 5, 6, 7# - */ - #if @R@ < PACK_FACTOR - npy_uint8 lane@R@_@N@[npyv_nlanes_u8]; - npyv_store_u8(lane@R@_@N@, npyv_reinterpret_u8_u@ssfx@(r@R@_@N@)); - op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)]; - op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)]; - #if npyv_nlanes_@sfx@ == 4 - op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)]; - op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)]; - #endif - #endif - /**end repeat4**/ - #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) + // Results are packed, so we can just loop over them + npy_uint8 lane[npyv_nlanes_u8]; + npyv_store_u8(lane, r); + for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){ + op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_@sfx@)]; + } #endif // @DTYPE@ == CONTIG - #endif // @unroll@ > @N@ - /**end repeat3**/ } // vector-sized iterations @@ -470,11 +494,11 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ npy_uint8 lane[npyv_nlanes_u8]; npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r)); - op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; - op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; + op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; + op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; #if npyv_nlanes_@sfx@ == 4 - op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; - op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; + op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; + op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; #endif } @@ -485,7 +509,6 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ *op = (npy_@kind@(*ip) != 0); } - npyv_cleanup(); } /**end repeat2**/ @@ -494,10 +517,6 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ #endif // @VCHK@ /**end repeat**/ -#undef WORKAROUND_CLANG_RECIPROCAL_BUG -#undef PREPACK_ISFINITE -#undef PREPACK_SIGNBIT - /******************************************************************************** ** Defining ufunc inner functions ********************************************************************************/ |
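For reference, the aarch64 fast paths in npyv_pack_signbit_f32 and npyv_pack_isfinite_f32 above rely on vqtbl4q_u8 to gather one byte per 32-bit lane from four input vectors in a single table lookup. A self-contained sketch of the same trick, assuming an AArch64 little-endian target (the function name is illustrative, not NumPy API):

```c
/* Build with an AArch64 compiler; vqtbl4q_u8 is A64-only. */
#include <arm_neon.h>

static uint8x16_t pack_signbit_f32_sketch(float32x4_t v0, float32x4_t v1,
                                          float32x4_t v2, float32x4_t v3)
{
    uint8x16x4_t tbl;
    tbl.val[0] = vreinterpretq_u8_f32(v0);
    tbl.val[1] = vreinterpretq_u8_f32(v1);
    tbl.val[2] = vreinterpretq_u8_f32(v2);
    tbl.val[3] = vreinterpretq_u8_f32(v3);

    /* Indices 3,7,11,... select the most-significant byte of every 32-bit
     * lane (little-endian layout) across all 64 table bytes. */
    static const uint8_t idx[16] = {3,7,11,15, 19,23,27,31,
                                    35,39,43,47, 51,55,59,63};
    uint8x16_t r = vqtbl4q_u8(tbl, vld1q_u8(idx));

    /* Keep only the sign bit, so each output byte is 0x00 or 0x01. */
    return vshrq_n_u8(r, 7);
}
```

The isfinite variant in the diff uses the same index vector but first shifts each 32-bit lane left by one so the gathered byte is the exponent field, then compares it against 0xff to detect inf/NaN.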
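One more detail worth making explicit: several kernels build the positive predicate from its negation (isnan from notnan, isfinite from notfinite) via npyv_andc_u8(truemask, mask), where truemask is npyv_setall_u8(1==1), i.e. every byte equal to 1. A scalar model of that mask-to-bool conversion, with illustrative names only:

```c
#include <stdint.h>

/* A vector comparison yields 0xFF per true byte and 0x00 per false byte,
 * but the ufunc writes into a NumPy bool array that must hold exactly 0/1. */

static uint8_t bool_from_mask(uint8_t cmp_mask)
{
    return cmp_mask & 1u;               /* npyv_and_u8(truemask, mask)       */
}

static uint8_t bool_from_negated_mask(uint8_t cmp_mask)
{
    return (uint8_t)(~cmp_mask) & 1u;   /* npyv_andc_u8(truemask, mask):     */
                                        /* truemask & ~mask, used when mask  */
                                        /* encodes notnan / notfinite        */
}
```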