Diffstat (limited to 'numpy/core/src/umath/loops_unary.dispatch.c.src')
-rw-r--r-- | numpy/core/src/umath/loops_unary.dispatch.c.src | 364
1 file changed, 364 insertions, 0 deletions
diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src
new file mode 100644
index 000000000..1e2a81d20
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -0,0 +1,364 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar ops
+ ******************************************************************************/
+#define scalar_negative(X) (-X)
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**begin repeat
+ * #sfx  = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #ssfx =  8,  8,  16,  16,  32,  32,  64,  64#
+ */
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64)
+    return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v)));
+#else
+    // (x ^ -1) + 1
+    const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1);
+    return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1);
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #fd   = f, #
+ */
+#if @VCHK@
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON)
+    return vnegq_@sfx@(v);
+#else
+    // (v ^ signmask)
+    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+    return npyv_xor_@sfx@(v, signmask);
+#endif
+}
+#endif // @VCHK@
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1*2#
+ * #supports_ncontig = 0*4,1*6#
+ */
+/**begin repeat1
+ * #kind   = negative#
+ * #intrin = negative#
+ * #unroll = 4#
+ */
+#if @simd_chk@
+#if @unroll@ < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && @unroll@ > 2
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+#define UNROLL @unroll@
+#endif
+// contiguous inputs and output.
+static NPY_INLINE void
+simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+                             npyv_lanetype_@sfx@ *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_store_@sfx@(op, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+
+#if @supports_ncontig@
+// contiguous input, non-contiguous output
+static NPY_INLINE void
+simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_storen_@sfx@(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+// non-contiguous input, contiguous output
+static NPY_INLINE void
+simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+                             npyv_lanetype_@sfx@ *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_store_@sfx@(op, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+// non-contiguous input and output
+// limit unroll to 2x
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+static NPY_INLINE void
+simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+        #endif
+        /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_storen_@sfx@(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+#endif // @supports_ncontig@
+#undef UNROLL
+#endif // @simd_chk@
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ *          BYTE, SHORT, INT, LONG, LONGLONG,
+ *          FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ *         npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+    #if @is_fp@
+        #define TO_SIMD_SFX(X) X##_f@len@
+        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif @is_unsigned@
+        #define TO_SIMD_SFX(X) X##_u@len@
+    #else
+        #define TO_SIMD_SFX(X) X##_s@len@
+    #endif
+/**end repeat1**/
+#endif
+
+/**begin repeat1
+ * #kind   = negative#
+ * #intrin = negative#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];
+    npy_intp istep = steps[0], ostep = steps[1],
+             len = dimensions[0];
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
+        if (IS_UNARY_CONT(@type@, @type@)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_@intrin@)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;
+        }
+    #if @supports_ncontig@
+        const npy_intp istride = istep / sizeof(STYPE);
+        const npy_intp ostride = ostep / sizeof(STYPE);
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_@intrin@)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_@intrin@)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+            // SSE2 does better with unrolled scalar for heavy non-contiguous
+            #if !defined(NPY_HAVE_SSE2)
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_@intrin@)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            #endif
+        }
+    #endif // @supports_ncontig@
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+        /**begin repeat2
+         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @U@
+        const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep));
+        *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@);
+        #endif
+        /**end repeat2**/
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {
+        *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:
+    npyv_cleanup();
+#endif
+#if @is_fp@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
+
+#undef NEGATIVE_CONTIG_ONLY