Diffstat (limited to 'numpy/core/src/umath/loops_unary.dispatch.c.src')
-rw-r--r--  numpy/core/src/umath/loops_unary.dispatch.c.src  364
1 file changed, 364 insertions(+), 0 deletions(-)
diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src
new file mode 100644
index 000000000..1e2a81d20
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -0,0 +1,364 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
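+
+// Note (editorial): the @targets block above drives NumPy's CPU dispatcher --
+// the build compiles this templated source once for the baseline and once per
+// SIMD extension listed, and NPY_CPU_DISPATCH_CURFX below suffixes each
+// exported symbol with the target it was built for, so the best available
+// variant can be selected at runtime.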
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar ops
+ ******************************************************************************/
+#define scalar_negative(X) (-X)
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #ssfx = 8, 8, 16, 16, 32, 32, 64, 64#
+ */
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64)
+ return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v)));
+#else
+ // (x ^ -1) + 1
+ const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1);
+ return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1);
+#endif
+}
+/**end repeat**/
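+
+// Worked example of the (x ^ -1) + 1 fallback above (illustrative only, not
+// part of the dispatch path): XOR with -1 is a bitwise NOT, and two's
+// complement negation is ~x + 1; the kernel writes the "+ 1" as a subtraction
+// of the -1 already held in m1. For x = 5 (0b00000101): x ^ -1 = 0b11111010
+// = -6, and -6 + 1 = -5.
+#if 0  // scalar reference sketch, deliberately kept out of the build
+static NPY_INLINE npy_int8
+ref_negative_s8(npy_int8 x)
+{
+    return (npy_int8)((x ^ -1) - (npy_int8)-1);
+}
+#endif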
+
+/**begin repeat
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #fd = f, #
+ */
+#if @VCHK@
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON)
+ return vnegq_@sfx@(v);
+#else
+ // (v ^ signmask)
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ return npyv_xor_@sfx@(v, signmask);
+#endif
+}
+#endif // @VCHK@
+/**end repeat**/
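+
+// Note on the XOR fallback above (illustrative only): IEEE-754 negation just
+// flips the sign bit, and -0.0 is the value whose bit pattern is exactly that
+// sign bit (0x80000000 for float32), so v ^ signmask negates every lane
+// without touching the exponent or mantissa bits.
+#if 0  // scalar reference sketch; assumes IEEE-754 binary32 and <string.h>
+static NPY_INLINE npy_float
+ref_negative_f32(npy_float x)
+{
+    npy_uint32 bits;
+    memcpy(&bits, &x, sizeof(bits));  // type-pun through memcpy, not a cast
+    bits ^= 0x80000000u;              // flip only the sign bit
+    memcpy(&x, &bits, sizeof(x));
+    return x;
+}
+#endif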
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1*2#
+ * #supports_ncontig = 0*4, 1*6#
+ */
+/**begin repeat1
+ * #kind = negative#
+ * #intrin = negative#
+ * #unroll = 4#
+ */
+#if @simd_chk@
+#if @unroll@ < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && @unroll@ > 2
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+#define UNROLL @unroll@
+#endif
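+// For example, a 128-bit build (NEON/SSE2) keeps UNROLL at @unroll@ and the
+// unrolled loops below process @unroll@ vectors per iteration, while wider
+// builds (AVX2/AVX512) cap it at 2 to sidestep the memory-bandwidth
+// bottleneck noted above.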
+// contiguous inputs and output.
+static NPY_INLINE void
+simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+ npyv_lanetype_@sfx@ *op,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_store_@sfx@(op, r);
+ }
+    // scalar loop to finish up any remaining iterations
+ for (; len > 0; --len, ++ip, ++op) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
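+// The cc kernel above is what the dispatch function at the bottom of this
+// file calls for the common case of no overlap and contiguous operands,
+// roughly:
+//   simd_unary_cc_@intrin@_@sfx@(src, dst, len);  // src/dst/len illustrative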
+
+#if @supports_ncontig@
+// contiguous input, non-contiguous output
+static NPY_INLINE void
+simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+ npyv_lanetype_@sfx@ *op, npy_intp ostride,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_storen_@sfx@(op, ostride, r);
+ }
+    // scalar loop to finish up any remaining iterations
+ for (; len > 0; --len, ++ip, op += ostride) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+// non-contiguous input, contiguous output
+static NPY_INLINE void
+simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+ npyv_lanetype_@sfx@ *op,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_store_@sfx@(op, r);
+ }
+    // scalar loop to finish up any remaining iterations
+ for (; len > 0; --len, ip += istride, ++op) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+// non-contiguous input and output
+// limit unroll to 2x
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+static NPY_INLINE void
+simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+ npyv_lanetype_@sfx@ *op, npy_intp ostride,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_storen_@sfx@(op, ostride, r);
+ }
+    // scalar loop to finish up any remaining iterations
+ for (; len > 0; --len, ip += istride, op += ostride) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+#endif // @supports_ncontig@
+#undef UNROLL
+#endif // @simd_chk@
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+ #if @is_fp@
+ #define TO_SIMD_SFX(X) X##_f@len@
+ #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+ #undef TO_SIMD_SFX
+ #endif
+ #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+ #undef TO_SIMD_SFX
+ #endif
+ #elif @is_unsigned@
+ #define TO_SIMD_SFX(X) X##_u@len@
+ #else
+ #define TO_SIMD_SFX(X) X##_s@len@
+ #endif
+/**end repeat1**/
+#endif
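+// Illustrative expansion (comment only): on an LP64 platform NPY_BITSOF_LONG
+// is 64, so the LONG instance of this block defines TO_SIMD_SFX(X) as X##_s64
+// and TO_SIMD_SFX(simd_unary_cc_negative) names simd_unary_cc_negative_s64;
+// the FLOAT instance keeps the _f32 suffix only when NPY_SIMD_F32 is set.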
+
+/**begin repeat1
+ * #kind = negative#
+ * #intrin = negative#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ char *ip = args[0], *op = args[1];
+ npy_intp istep = steps[0], ostep = steps[1],
+ len = dimensions[0];
+#ifdef TO_SIMD_SFX
+ #undef STYPE
+ #define STYPE TO_SIMD_SFX(npyv_lanetype)
+ if (!is_mem_overlap(ip, istep, op, ostep, len)) {
+ if (IS_UNARY_CONT(@type@, @type@)) {
+ // no overlap and operands are contiguous
+ TO_SIMD_SFX(simd_unary_cc_@intrin@)(
+ (STYPE*)ip, (STYPE*)op, len
+ );
+ goto clear;
+ }
+ #if @supports_ncontig@
+ const npy_intp istride = istep / sizeof(STYPE);
+ const npy_intp ostride = ostep / sizeof(STYPE);
+ if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+ TO_SIMD_SFX(npyv_storable_stride)(ostride))
+ {
+ if (istride == 1 && ostride != 1) {
+ // contiguous input, non-contiguous output
+ TO_SIMD_SFX(simd_unary_cn_@intrin@)(
+ (STYPE*)ip, (STYPE*)op, ostride, len
+ );
+ goto clear;
+ }
+ else if (istride != 1 && ostride == 1) {
+ // non-contiguous input, contiguous output
+ TO_SIMD_SFX(simd_unary_nc_@intrin@)(
+ (STYPE*)ip, istride, (STYPE*)op, len
+ );
+ goto clear;
+ }
+        // SSE2 does better with unrolled scalar when both operands are non-contiguous
+ #if !defined(NPY_HAVE_SSE2)
+ else if (istride != 1 && ostride != 1) {
+ // non-contiguous input and output
+ TO_SIMD_SFX(simd_unary_nn_@intrin@)(
+ (STYPE*)ip, istride, (STYPE*)op, ostride, len
+ );
+ goto clear;
+ }
+ #endif
+ }
+ #endif // @supports_ncontig@
+ }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+ /*
+ * scalar unrolls
+ * 8x unroll performed best on
+ * - Apple M1 Native / arm64
+ * - Apple M1 Rosetta / SSE42
+ * - iMacPro / AVX512
+ */
+ #define UNROLL 8
+ for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep));
+ *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@);
+ #endif
+ /**end repeat2**/
+ }
+#endif // NPY_DISABLE_OPTIMIZATION
+ for (; len > 0; --len, ip += istep, op += ostep) {
+ *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip);
+ }
+#ifdef TO_SIMD_SFX
+clear:
+ npyv_cleanup();
+#endif
+#if @is_fp@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
+
+#undef NEGATIVE_CONTIG_ONLY