Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py         |    8
-rw-r--r--  numpy/core/src/umath/loops_arithm_fp.dispatch.c.src  | 1186
2 files changed, 523 insertions, 671 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index ae1dcee7b..350dd19c3 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -404,7 +404,8 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.conjugate'),
None,
- TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
+ TD(ints+flts+cmplx, simd=[('avx2', ints)],
+ dispatch=[('loops_arithm_fp', 'FD')]),
TD(P, f='conjugate'),
),
'fmod':
@@ -419,7 +420,8 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
+ TD(ints+inexact, simd=[('avx2', ints)],
+ dispatch=[('loops_unary_fp', 'fd'), ('loops_arithm_fp', 'FD')]),
TD(O, f='Py_square'),
),
'reciprocal':
@@ -460,7 +462,7 @@ defdict = {
'PyUFunc_AbsoluteTypeResolver',
TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd'),
('loops_logical', '?')]),
- TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
+ TD(cmplx, dispatch=[('loops_arithm_fp', 'FD')], out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
'_arg':
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index c1bfaa63a..183362cf2 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,8 @@
/*@targets
** $maxopt baseline
- ** sse2 avx2 avx512f
+ ** sse2 (avx2 fma3) avx512f
+ ** neon asimd
+ ** vsx2 vsx3
** vx vxe
**/
#define _UMATHMODULE
@@ -14,708 +16,293 @@
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"
-// TODO: replace raw SIMD with NPYV
+/**
+ * TODO:
+ * - Improve the implementation of SIMD complex absolute,
+ *   the current one is somewhat slow and can be optimized by
+ *   at least avoiding the division and keeping the sqrt.
+ * - Vectorize reductions
+ * - Add support for ASIMD/VCMLA through universal intrinsics.
+ */
+
//###############################################################################
//## Real Single/Double precision
//###############################################################################
/********************************************************************************
- ** Defining the SIMD kernels
+ ** Defining ufunc inner functions
********************************************************************************/
-#ifdef NPY_HAVE_SSE2
+
/**begin repeat
+ * Float types
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
+ * #sfx = f32, f64#
* #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
+ * #C = F, #
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
*/
/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-static void
-sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # intrin = add, sub, mul, div#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ * # is_div = 0*3, 1#
+ * # is_mul = 0*2, 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
+ npy_intp len = dimensions[0];
+ char *src0 = args[0], *src1 = args[1], *dst = args[2];
+ npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+ // reduce
+ if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+ #if @PW@
+ *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1);
+ #else
+ @type@ acc = *((@type@*)src0);
+ if (ssrc1 == sizeof(@type@)) {
+ for (; len > 0; --len, src1 += sizeof(@type@)) {
+ acc @OP@= *(@type@ *)src1;
}
- }
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- }
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
- npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
+ } else {
+ for (; len > 0; --len, src1 += ssrc1) {
+ acc @OP@= *(@type@ *)src1;
}
}
+ *((@type@*)src0) = acc;
+ #endif
+ return;
}
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
+#if @VECTOR@
+ if (len > npyv_nlanes_@sfx@*2 &&
+ !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+ !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+ ) {
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * 2;
+ const int hstep = npyv_nlanes_@sfx@;
+ const int lstep = hstep * 2;
+ // lots of specializations, to squeeze out max performance
+ if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+ for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+ npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
+ for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+ #if @is_div@
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
}
- }
-#else
- const npy_intp vector_size_bytes = 16;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
- npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
+ else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) {
+ npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0));
+ for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
+ for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+ #if @is_div@ || @is_mul@
+ npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
}
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
+ else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) {
+ npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1));
+ for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
+ for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+ #if @is_div@ || @is_mul@
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
+ } else {
+ goto loop_scalar;
}
+ npyv_cleanup();
+ return;
}
+loop_scalar:
#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
-
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
- }
-}
-
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
+ for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+ const @type@ a = *((@type@*)src0);
+ const @type@ b = *((@type@*)src1);
+ *((@type@*)dst) = a @OP@ b;
}
}
-
/**end repeat1**/
/**end repeat**/
-#else // NPY_HAVE_SSE2
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #sfx = f32, f64#
- * #CHK = _F32, _F64#
- */
-#if NPY_SIMD@CHK@
-/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-
-static void
-simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[i];
- }
- /* lots of specializations, to squeeze out max performance */
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
+//###############################################################################
+//## Complex Single/Double precision
+//###############################################################################
-static void
-simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[0] @OP@ ip2[i];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
- }
-}
+/********************************************************************************
+ ** op intrinsics
+ ********************************************************************************/
-static void
-simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a)
{
- const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[0];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
- }
+ npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a));
+ npyv_f32x2 r;
+ r.val[0] = fill;
+ r.val[1] = fill;
+ return r;
}
-/**end repeat1**/
-#endif /* NPY_SIMD@CHK@ */
-/**end repeat**/
-#endif // NPY_HAVE_SSE2
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #vector = 1, 1, 0#
- * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 #
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- */
-static inline int
-run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+NPY_FINLINE npyv_f32
+simd_cconjugate_f32(npyv_f32 x)
{
-#if @vector@ && defined NPY_HAVE_SSE2
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
-#if defined NPY_HAVE_AVX512F
- const npy_uintp vector_size_bytes = 64;
-#elif defined NPY_HAVE_AVX2
- const npy_uintp vector_size_bytes = 32;
+#if NPY_SIMD_BIGENDIAN
+ const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000));
#else
- const npy_uintp vector_size_bytes = 32;
-#endif
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
-#elif @VECTOR@
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
+ const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL));
#endif
- return 0;
+ return npyv_xor_f32(x, mask);
}
-/**end repeat1**/
-/**end repeat**/
-/********************************************************************************
- ** Defining ufunc inner functions
- ********************************************************************************/
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #c = f, #
- * #C = F, #
- * #count = 4,2#
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
- * # PW = 1, 0, 0, 0#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+NPY_FINLINE npyv_f32
+simd_cmul_f32(npyv_f32 a, npyv_f32 b)
{
- if (IS_BINARY_REDUCE) {
-#if @PW@
- @type@ * iop1 = (@type@ *)args[0];
- npy_intp n = dimensions[0];
-
- *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]);
-#else
- BINARY_REDUCE_LOOP(@type@) {
- io1 @OP@= *(@type@ *)ip2;
- }
- *((@type@ *)iop1) = io1;
-#endif
- }
- else if (dimensions[0] < @count@ || !run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ in2 = *(@type@ *)ip2;
- *((@type@ *)op1) = in1 @OP@ in2;
- }
- }
+ npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2);
+ npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2);
+ npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3);
+ // a_im * b_im, a_im * b_re
+ npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev);
+ return npyv_muladdsub_f32(a_re, b, ab_iiir);
}
-/**end repeat1**/
-/**end repeat**/
-//###############################################################################
-//## Complex Single/Double precision
-//###############################################################################
-/********************************************************************************
- ** Defining the SIMD kernels
- ********************************************************************************/
-#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
- /**
- * For somehow MSVC commit aggressive optimization lead
- * to raises 'RuntimeWarning: invalid value encountered in multiply'
- *
- * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to
- * investigate about it while moving to NPYV.
- */
- #define AVX512F_NOMSVC
+NPY_FINLINE npyv_f32
+simd_csquare_f32(npyv_f32 x)
+{ return simd_cmul_f32(x, x); }
#endif
-#ifdef AVX512F_NOMSVC
-NPY_FINLINE __mmask16
-avx512_get_full_load_mask_ps(void)
-{
- return 0xFFFF;
-}
+#if NPY_SIMD_F64
-NPY_FINLINE __mmask8
-avx512_get_full_load_mask_pd(void)
-{
- return 0xFF;
-}
-NPY_FINLINE __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
+NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a)
{
- return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
+ npyv_f64 r = npyv_setall_f64(a[0]);
+ npyv_f64 i = npyv_setall_f64(a[1]);
+ return npyv_zip_f64(r, i);
}
-NPY_FINLINE __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+NPY_FINLINE npyv_f64
+simd_cconjugate_f64(npyv_f64 x)
{
- return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+ const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64(
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL
+ ));
+ return npyv_xor_f64(x, mask);
}
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
+NPY_FINLINE npyv_f64
+simd_cmul_f64(npyv_f64 a, npyv_f64 b)
{
- return (0x0001 << num_elem) - 0x0001;
+ npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0);
+ npyv_f64 a_re = npyv_permi128_f64(a, 0, 0);
+ npyv_f64 a_im = npyv_permi128_f64(a, 1, 1);
+ // a_im * b_im, a_im * b_re
+ npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev);
+ return npyv_muladdsub_f64(a_re, b, ab_iiir);
}
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x01 << num_elem) - 0x01;
-}
-/**begin repeat
- * #vsub = ps, pd#
- * #type= npy_float, npy_double#
- * #epi_vsub = epi32, epi64#
- * #vtype = __m512, __m512d#
- * #mask = __mmask16, __mmask8#
- * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- * #neg_mask = 0x80000000, 0x8000000000000000#
- * #perm_ = 0xb1, 0x55#
- * #cmpx_img_mask = 0xAAAA, 0xAA#
- * #cmpx_re_mask = 0x5555, 0x55#
- * #INF = NPY_INFINITYF, NPY_INFINITY#
- * #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
-{
- return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
- return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-NPY_FINLINE @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
- // x1 = r1, i1
- // x2 = r2, i2
- @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
- @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
- @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
- @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
- @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
- return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-/**end repeat**/
+NPY_FINLINE npyv_f64
+simd_csquare_f64(npyv_f64 x)
+{ return simd_cmul_f64(x, x); }
#endif
/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-/**begin repeat1
- * #func = add, subtract, multiply#
- * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
- */
-#if defined AVX512F_NOMSVC
-static inline void
-AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
- const npy_intp array_size = dimensions[0];
- npy_intp num_remaining_elements = 2*array_size;
- @type@* ip1 = (@type@*) args[0];
- @type@* ip2 = (@type@*) args[1];
- @type@* op = (@type@*) args[2];
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1, x2;
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
- x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-
- @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
-
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-
- ip1 += @num_lanes@;
- ip2 += @num_lanes@;
- op += @num_lanes@;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif // AVX512F_NOMSVC
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-/**begin repeat1
- * #func = add, subtract, multiply#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
*/
-static inline int
-run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
{
-#if defined AVX512F_NOMSVC
- if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
- AVX512F_@func@_@TYPE@(args, dimensions, steps);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
+ const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+ const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+ re = npyv_abs_@sfx@(re);
+ im = npyv_abs_@sfx@(im);
+ /*
+ * If real or imag = INF, then convert it to inf + j*inf
+ * Handles: inf + j*nan, nan + j*inf
+ */
+ npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+ npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+ im = npyv_select_@sfx@(re_infmask, inf, im);
+ re = npyv_select_@sfx@(im_infmask, inf, re);
+ /*
+ * If real or imag = NAN, then convert it to nan + j*nan
+ * Handles: x + j*nan, nan + j*x
+ */
+ npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+ npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+ im = npyv_select_@sfx@(re_nnanmask, im, nan);
+ re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+ npyv_@sfx@ larger = npyv_max_@sfx@(re, im);
+ npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+ /*
+ * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+ */
+ npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+ npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+ npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+ npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+ npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+ npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+ ));
+ return npyv_mul_@sfx@(hypot, larger);
}
-/**end repeat1**/
+#endif // VECTOR
/**end repeat**/
/********************************************************************************
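[Editor's note] The new simd_cabsolute_@sfx@ helpers above compute |re + j*im| as larger * sqrt(1 + (smaller/larger)^2), masking the division so 0/0 and inf/inf never occur. A minimal scalar C sketch of the same scaled-hypot idea (illustrative only, not part of this patch; the function name is made up):

#include <math.h>

/* Scalar counterpart of the vector kernel: scale by the larger magnitude
 * so the squared ratio stays in [0, 1] and the intermediate never overflows. */
static double scalar_cabsolute(double re, double im)
{
    re = fabs(re);
    im = fabs(im);
    /* inf + j*nan and nan + j*inf map to inf, matching the vector path */
    if (isinf(re) || isinf(im)) {
        return INFINITY;
    }
    /* any remaining nan component makes the result nan */
    if (isnan(re) || isnan(im)) {
        return NAN;
    }
    double larger  = re > im ? re : im;
    double smaller = re > im ? im : re;
    if (larger == 0.0) {        /* both components zero: avoid 0/0 */
        return 0.0;
    }
    double ratio = smaller / larger;
    return larger * sqrt(1.0 + ratio * ratio);
}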
@@ -725,55 +312,318 @@ run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const
* complex types
* #TYPE = CFLOAT, CDOUBLE#
* #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
* #c = f, #
* #C = F, #
*/
/**begin repeat1
* arithmetic
- * #kind = add, subtract#
- * #OP = +, -#
- * #PW = 1, 0#
+ * #kind = add, subtract, multiply#
+ * #vectorf = npyv_add, npyv_sub, simd_cmul#
+ * #OP = +, -, *#
+ * #PW = 1, 0, 0#
+ * #is_mul = 0*2, 1#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- // Parenthesis around @PW@ tells clang dead code is intentional
- if (IS_BINARY_REDUCE && (@PW@)) {
- npy_intp n = dimensions[0];
- @ftype@ * or = ((@ftype@ *)args[0]);
- @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
+ npy_intp len = dimensions[0];
+ char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
+ npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
+#if @PW@
+ // reduce
+ if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+ b_ssrc1 % (sizeof(@ftype@)*2) == 0
+ ) {
+ @ftype@ *rl_im = (@ftype@ *)b_src0;
@ftype@ rr, ri;
-
- @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
- *or @OP@= rr;
- *oi @OP@= ri;
+ @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+ rl_im[0] @OP@= rr;
+ rl_im[1] @OP@= ri;
return;
}
- if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @ftype@ in1r = ((@ftype@ *)ip1)[0];
- const @ftype@ in1i = ((@ftype@ *)ip1)[1];
- const @ftype@ in2r = ((@ftype@ *)ip2)[0];
- const @ftype@ in2i = ((@ftype@ *)ip2)[1];
- ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
- ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+#endif
+#if @VECTOR@
+ if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
+ is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+ b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 ||
+ b_ssrc0 % sizeof(@ftype@) != 0 ||
+ b_ssrc1 % sizeof(@ftype@) != 0
+ ) {
+ goto loop_scalar;
+ }
+ const @ftype@ *src0 = (@ftype@*)b_src0;
+ const @ftype@ *src1 = (@ftype@*)b_src1;
+ @ftype@ *dst = (@ftype@*)b_dst;
+
+ const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@);
+ const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@);
+ const npy_intp sdst = b_sdst / sizeof(@ftype@);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ const int loadable0 = npyv_loadable_stride_s64(ssrc0);
+ const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+ const int storeable = npyv_storable_stride_s64(sdst);
+
+ // lots**lots of specializations, to squeeze out max performance
+ // contig
+ if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+ for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+ npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+ npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // scalar 0
+ else if (ssrc0 == 0) {
+ npyv_@sfx@x2 a = simd_set2_@sfx@(src0);
+ // contig
+ if (ssrc1 == 2 && sdst == ssrc1) {
+ for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+ #if @is_mul@
+ npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // non-contig
+ else if (loadable1 && storeable) {
+ for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+ npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+ #if @is_mul@
+ npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ else {
+ goto loop_scalar;
+ }
+ }
+ // scalar 1
+ else if (ssrc1 == 0) {
+ npyv_@sfx@x2 b = simd_set2_@sfx@(src1);
+ if (ssrc0 == 2 && sdst == ssrc0) {
+ for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
}
+ // non-contig
+ else if (loadable0 && storeable) {
+ for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ else {
+ goto loop_scalar;
+ }
+ }
+ #if @is_mul@
+ // non-contig
+ else if (loadable0 && loadable1 && storeable) {
+ for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+ src1 += ssrc1*vstep, dst += sdst*vstep
+ ) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+ npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+ npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+ src1 += ssrc1*hstep, dst += sdst*hstep
+ ) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+ npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+ npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ #endif
+ else {
+ goto loop_scalar;
+ }
+ npyv_cleanup();
+ return;
+loop_scalar:
+#endif
+ for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
+ const @ftype@ a_r = ((@ftype@ *)b_src0)[0];
+ const @ftype@ a_i = ((@ftype@ *)b_src0)[1];
+ const @ftype@ b_r = ((@ftype@ *)b_src1)[0];
+ const @ftype@ b_i = ((@ftype@ *)b_src1)[1];
+ #if @is_mul@
+ ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i;
+ ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r;
+ #else
+ ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r;
+ ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i;
+ #endif
}
}
/**end repeat1**/
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply)
+/**begin repeat1
+ * #kind = conjugate, square#
+ * #is_square = 0, 1#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @ftype@ in1r = ((@ftype@ *)ip1)[0];
- const @ftype@ in1i = ((@ftype@ *)ip1)[1];
- const @ftype@ in2r = ((@ftype@ *)ip2)[0];
- const @ftype@ in2i = ((@ftype@ *)ip2)[1];
- ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
- ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+ npy_intp len = dimensions[0];
+ char *b_src = args[0], *b_dst = args[1];
+ npy_intp b_ssrc = steps[0], b_sdst = steps[1];
+#if @VECTOR@
+ if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
+ b_sdst % sizeof(@ftype@) != 0 ||
+ b_ssrc % sizeof(@ftype@) != 0
+ ) {
+ goto loop_scalar;
+ }
+ const @ftype@ *src = (@ftype@*)b_src;
+ @ftype@ *dst = (@ftype@*)b_dst;
+ const npy_intp ssrc = b_ssrc / sizeof(@ftype@);
+ const npy_intp sdst = b_sdst / sizeof(@ftype@);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ if (ssrc == 2 && ssrc == sdst) {
+ for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+ npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+ npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+ npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
+ for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+ npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+ npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+ npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
+ for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+ npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+ npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len);
+ npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+ npyv_store2_till_@sfx@(dst, len, r);
}
}
+ else {
+ goto loop_scalar;
+ }
+ npyv_cleanup();
+ return;
+loop_scalar:
+#endif
+ for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
+ const @ftype@ rl = ((@ftype@ *)b_src)[0];
+ const @ftype@ im = ((@ftype@ *)b_src)[1];
+ #if @is_square@
+ ((@ftype@ *)b_dst)[0] = rl*rl - im*im;
+ ((@ftype@ *)b_dst)[1] = rl*im + im*rl;
+ #else
+ ((@ftype@ *)b_dst)[0] = rl;
+ ((@ftype@ *)b_dst)[1] = -im;
+ #endif
+ }
}
+/**end repeat1**/
/**end repeat**/
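[Editor's note] simd_cmul_f32/f64 above assemble the complex product from the duplicated real parts of a, the duplicated imaginary parts of a, and a lane-swapped copy of b, then combine them with npyv_muladdsub (subtract on even lanes, add on odd lanes). A scalar sketch of the identity being vectorized (illustrative only, not part of this patch):

/* Per complex element the kernel computes
 *   real: a.re*b.re - a.im*b.im   (even lane, the "sub" half)
 *   imag: a.re*b.im + a.im*b.re   (odd lane,  the "add" half)
 */
typedef struct { double re, im; } cdouble;

static cdouble scalar_cmul(cdouble a, cdouble b)
{
    cdouble r;
    r.re = a.re * b.re - a.im * b.im;
    r.im = a.re * b.im + a.im * b.re;
    return r;
}

One muladdsub per register replaces the hadd/hsub shuffles and mask blend of the removed avx512_cmul_@vsub@ helper.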