Diffstat (limited to 'numpy/core/src/umath/loops_unary.dispatch.c.src')
-rw-r--r-- | numpy/core/src/umath/loops_unary.dispatch.c.src | 364
1 file changed, 364 insertions, 0 deletions
diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src
new file mode 100644
index 000000000..1e2a81d20
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -0,0 +1,364 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar ops
+ ******************************************************************************/
+#define scalar_negative(X) (-X)
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**begin repeat
+ * #sfx  = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #ssfx =  8,  8,  16,  16,  32,  32,  64,  64#
+ */
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64)
+    return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v)));
+#else
+    // (x ^ -1) + 1
+    const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1);
+    return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1);
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #fd   = f, #
+ */
+#if @VCHK@
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON)
+    return vnegq_@sfx@(v);
+#else
+    // (v ^ signmask)
+    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+    return npyv_xor_@sfx@(v, signmask);
+#endif
+}
+#endif // @VCHK@
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1*2#
+ * #supports_ncontig = 0*4,1*6#
+ */
+/**begin repeat1
+ * #kind   = negative#
+ * #intrin = negative#
+ * #unroll = 4#
+ */
+#if @simd_chk@
+#if @unroll@ < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && @unroll@ > 2
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+#define UNROLL @unroll@
+#endif
+// contiguous inputs and output.
+static NPY_INLINE void
+simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+                             npyv_lanetype_@sfx@ *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_store_@sfx@(op, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+
+#if @supports_ncontig@
+// contiguous input, non-contiguous output
+static NPY_INLINE void
+simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_storen_@sfx@(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+// non-contiguous input, contiguous output
+static NPY_INLINE void
+simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+                             npyv_lanetype_@sfx@ *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_store_@sfx@(op, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+// non-contiguous input and output
+// limit unroll to 2x
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+static NPY_INLINE void
+simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_storen_@sfx@(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+#endif // @supports_ncontig@
+#undef UNROLL
+#endif // @simd_chk@
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ *          BYTE, SHORT, INT, LONG, LONGLONG,
+ *          FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ *         npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+    #if @is_fp@
+        #define TO_SIMD_SFX(X) X##_f@len@
+        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif @is_unsigned@
+        #define TO_SIMD_SFX(X) X##_u@len@
+    #else
+        #define TO_SIMD_SFX(X) X##_s@len@
+    #endif
+/**end repeat1**/
+#endif
+
+/**begin repeat1
+ * #kind   = negative#
+ * #intrin = negative#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];
+    npy_intp istep = steps[0], ostep = steps[1],
+             len = dimensions[0];
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
+        if (IS_UNARY_CONT(@type@, @type@)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_@intrin@)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;
+        }
+    #if @supports_ncontig@
+        const npy_intp istride = istep / sizeof(STYPE);
+        const npy_intp ostride = ostep / sizeof(STYPE);
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_@intrin@)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_@intrin@)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+            // SSE2 does better with unrolled scalar for heavy non-contiguous
+            #if !defined(NPY_HAVE_SSE2)
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_@intrin@)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            #endif
+        }
+    #endif // @supports_ncontig@
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep));
+        *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@);
+        #endif
+        /**end repeat2**/
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {
+        *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:
+    npyv_cleanup();
+#endif
+#if @is_fp@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
+
+#undef NEGATIVE_CONTIG_ONLY