diff options
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/code_generators/generate_umath.py | 8 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_arithm_fp.dispatch.c.src | 1186 |
2 files changed, 523 insertions, 671 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index ae1dcee7b..350dd19c3 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -404,7 +404,8 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.conjugate'), None, - TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]), + TD(ints+flts+cmplx, simd=[('avx2', ints)], + dispatch=[('loops_arithm_fp', 'FD')]), TD(P, f='conjugate'), ), 'fmod': @@ -419,7 +420,8 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.square'), None, - TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]), + TD(ints+inexact, simd=[('avx2', ints)], + dispatch=[('loops_unary_fp', 'fd'), ('loops_arithm_fp', 'FD')]), TD(O, f='Py_square'), ), 'reciprocal': @@ -460,7 +462,7 @@ defdict = { 'PyUFunc_AbsoluteTypeResolver', TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd'), ('loops_logical', '?')]), - TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')), + TD(cmplx, dispatch=[('loops_arithm_fp', 'FD')], out=('f', 'd', 'g')), TD(O, f='PyNumber_Absolute'), ), '_arg': diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index c1bfaa63a..183362cf2 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -1,6 +1,8 @@ /*@targets ** $maxopt baseline - ** sse2 avx2 avx512f + ** sse2 (avx2 fma3) avx512f + ** neon asimd + ** vsx2 vsx3 ** vx vxe **/ #define _UMATHMODULE @@ -14,708 +16,293 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" -// TODO: replace raw SIMD with NPYV +/** + * TODO: + * - Improve the implementation of SIMD complex absolute, + * current one kinda slow and it can be optimized by + * at least avoiding the division and keep sqrt. + * - Vectorize reductions + * - Add support for ASIMD/VCMLA through universal intrinics. + */ + //############################################################################### //## Real Single/Double precision //############################################################################### /******************************************************************************** - ** Defining the SIMD kernels + ** Defining ufunc inner functions ********************************************************************************/ -#ifdef NPY_HAVE_SSE2 + /**begin repeat + * Float types * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# - * #scalarf = npy_sqrtf, npy_sqrt# + * #sfx = f32, f64# * #c = f, # - * #vtype = __m128, __m128d# - * #vtype256 = __m256, __m256d# - * #vtype512 = __m512, __m512d# - * #vpre = _mm, _mm# - * #vpre256 = _mm256, _mm256# - * #vpre512 = _mm512, _mm512# - * #vsuf = ps, pd# - * #vsufs = ss, sd# - * #nan = NPY_NANF, NPY_NAN# - * #double = 0, 1# - * #cast = _mm_castps_si128, _mm_castpd_si128# + * #C = F, # + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# */ /**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ -static void -sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # intrin = add, sub, mul, div# + * # OP = +, -, *, /# + * # PW = 1, 0, 0, 0# + * # is_div = 0*3, 1# + * # is_mul = 0*2, 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); + npy_intp len = dimensions[0]; + char *src0 = args[0], *src1 = args[1], *dst = args[2]; + npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; + // reduce + if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) { + #if @PW@ + *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1); + #else + @type@ acc = *((@type@*)src0); + if (ssrc1 == sizeof(@type@)) { + for (; len > 0; --len, src1 += sizeof(@type@)) { + acc @OP@= *(@type@ *)src1; } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - } -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); + } else { + for (; len > 0; --len, src1 += ssrc1) { + acc @OP@= *(@type@ *)src1; } } + *((@type@*)src0) = acc; + #endif + return; } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); +#if @VECTOR@ + if (len > npyv_nlanes_@sfx@*2 && + !is_mem_overlap(src0, ssrc0, dst, sdst, len) && + !is_mem_overlap(src1, ssrc1, dst, sdst, len) + ) { + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * 2; + const int hstep = npyv_nlanes_@sfx@; + const int lstep = hstep * 2; + // lots of specializations, to squeeze out max performance + if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); + npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); + npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); + npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); + for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + #if @is_div@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); + npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } } - } -#else - const npy_intp vector_size_bytes = 16; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); + else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) { + npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0)); + for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); + npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); + for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_div@ || @is_mul@ + npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + #else + npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); + else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) { + npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1)); + for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); + npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_div@ || @is_mul@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } + } else { + goto loop_scalar; } + npyv_cleanup(); + return; } +loop_scalar: #endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} - -static void -sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - - -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const npy_intp vector_size_bytes = 16; - const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} - -static void -sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const npy_intp vector_size_bytes = 16; - const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; + for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { + const @type@ a = *((@type@*)src0); + const @type@ b = *((@type@*)src1); + *((@type@*)dst) = a @OP@ b; } } - /**end repeat1**/ /**end repeat**/ -#else // NPY_HAVE_SSE2 - -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #CHK = _F32, _F64# - */ -#if NPY_SIMD@CHK@ -/**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ - -static void -simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[i]; - } - /* lots of specializations, to squeeze out max performance */ - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); - npyv_store_@sfx@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); - npyv_store_@sfx@(&op[i], c); - } - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} +//############################################################################### +//## Complex Single/Double precision +//############################################################################### -static void -simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[0] @OP@ ip2[i]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} +/******************************************************************************** + ** op intrinics + ********************************************************************************/ -static void -simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a) { - const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[0]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } + npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a)); + npyv_f32x2 r; + r.val[0] = fill; + r.val[1] = fill; + return r; } -/**end repeat1**/ -#endif /* NPY_SIMD@CHK@ */ -/**end repeat**/ -#endif // NPY_HAVE_SSE2 -/**begin repeat - * Float types - * #type = npy_float, npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #vector = 1, 1, 0# - * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 # - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - */ -static inline int -run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +NPY_FINLINE npyv_f32 +simd_cconjugate_f32(npyv_f32 x) { -#if @vector@ && defined NPY_HAVE_SSE2 - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; -#if defined NPY_HAVE_AVX512F - const npy_uintp vector_size_bytes = 64; -#elif defined NPY_HAVE_AVX2 - const npy_uintp vector_size_bytes = 32; +#if NPY_SIMD_BIGENDIAN + const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000)); #else - const npy_uintp vector_size_bytes = 32; -#endif - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { - sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#elif @VECTOR@ - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } + const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL)); #endif - return 0; + return npyv_xor_f32(x, mask); } -/**end repeat1**/ -/**end repeat**/ -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = f, # - * #C = F, # - * #count = 4,2# - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - * # PW = 1, 0, 0, 0# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +NPY_FINLINE npyv_f32 +simd_cmul_f32(npyv_f32 a, npyv_f32 b) { - if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; - - *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]); -#else - BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; - } - *((@type@ *)iop1) = io1; -#endif - } - else if (dimensions[0] < @count@ || !run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; - } - } + npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2); + npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2); + npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3); + // a_im * b_im, a_im * b_re + npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev); + return npyv_muladdsub_f32(a_re, b, ab_iiir); } -/**end repeat1**/ -/**end repeat**/ -//############################################################################### -//## Complex Single/Double precision -//############################################################################### -/******************************************************************************** - ** Defining the SIMD kernels - ********************************************************************************/ -#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F) - /** - * For somehow MSVC commit aggressive optimization lead - * to raises 'RuntimeWarning: invalid value encountered in multiply' - * - * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to - * investigate about it while moving to NPYV. - */ - #define AVX512F_NOMSVC +NPY_FINLINE npyv_f32 +simd_csquare_f32(npyv_f32 x) +{ return simd_cmul_f32(x, x); } #endif -#ifdef AVX512F_NOMSVC -NPY_FINLINE __mmask16 -avx512_get_full_load_mask_ps(void) -{ - return 0xFFFF; -} +#if NPY_SIMD_F64 -NPY_FINLINE __mmask8 -avx512_get_full_load_mask_pd(void) -{ - return 0xFF; -} -NPY_FINLINE __m512 -avx512_masked_load_ps(__mmask16 mask, npy_float* addr) +NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a) { - return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); + npyv_f64 r = npyv_setall_f64(a[0]); + npyv_f64 i = npyv_setall_f64(a[1]); + return npyv_zip_f64(r, i); } -NPY_FINLINE __m512d -avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +NPY_FINLINE npyv_f64 +simd_cconjugate_f64(npyv_f64 x) { - return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); + const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64( + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL + )); + return npyv_xor_f64(x, mask); } -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) +NPY_FINLINE npyv_f64 +simd_cmul_f64(npyv_f64 a, npyv_f64 b) { - return (0x0001 << num_elem) - 0x0001; + npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0); + npyv_f64 a_re = npyv_permi128_f64(a, 0, 0); + npyv_f64 a_im = npyv_permi128_f64(a, 1, 1); + // a_im * b_im, a_im * b_re + npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev); + return npyv_muladdsub_f64(a_re, b, ab_iiir); } -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) -{ - return (0x01 << num_elem) - 0x01; -} -/**begin repeat - * #vsub = ps, pd# - * #type= npy_float, npy_double# - * #epi_vsub = epi32, epi64# - * #vtype = __m512, __m512d# - * #mask = __mmask16, __mmask8# - * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# - * #neg_mask = 0x80000000, 0x8000000000000000# - * #perm_ = 0xb1, 0x55# - * #cmpx_img_mask = 0xAAAA, 0xAA# - * #cmpx_re_mask = 0x5555, 0x55# - * #INF = NPY_INFINITYF, NPY_INFINITY# - * #NAN = NPY_NANF, NPY_NAN# - */ -NPY_FINLINE @vtype@ -avx512_hadd_@vsub@(const @vtype@ x) -{ - return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} - -NPY_FINLINE @vtype@ -avx512_hsub_@vsub@(const @vtype@ x) -{ - return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} -NPY_FINLINE @vtype@ -avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) -{ - // x1 = r1, i1 - // x2 = r2, i2 - @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 - @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 - @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 - @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 - @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 - return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); -} -/**end repeat**/ +NPY_FINLINE npyv_f64 +simd_csquare_f64(npyv_f64 x) +{ return simd_cmul_f64(x, x); } #endif /**begin repeat - * #TYPE = CFLOAT, CDOUBLE# * #type = npy_float, npy_double# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #epi_vsub = epi32, epi64# - * #mask = __mmask16, __mmask8# - * #vtype = __m512, __m512d# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #storemask = 0xFF, 0xF# - * #IS_FLOAT = 1, 0# - */ -/**begin repeat1 - * #func = add, subtract, multiply# - * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# - */ -#if defined AVX512F_NOMSVC -static inline void -AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = 2*array_size; - @type@* ip1 = (@type@*) args[0]; - @type@* ip2 = (@type@*) args[1]; - @type@* op = (@type@*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1, x2; - x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); - x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); - - @vtype@ out = @vectorf@_@vsuffix@(x1, x2); - - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - - ip1 += @num_lanes@; - ip2 += @num_lanes@; - op += @num_lanes@; - num_remaining_elements -= @num_lanes@; - } -} -#endif // AVX512F_NOMSVC -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - * #type= npy_float, npy_double# - * #esize = 8, 16# - */ -/**begin repeat1 - * #func = add, subtract, multiply# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #usfx = b32, u64# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #is_double = 0, 1# + * #c = f, # + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# */ -static inline int -run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +#if @VECTOR@ +NPY_FINLINE npyv_@sfx@ +simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im) { -#if defined AVX512F_NOMSVC - if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { - AVX512F_@func@_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; + const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@); + const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@); + + re = npyv_abs_@sfx@(re); + im = npyv_abs_@sfx@(im); + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf); + npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf); + im = npyv_select_@sfx@(re_infmask, inf, im); + re = npyv_select_@sfx@(im_infmask, inf, re); + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re); + npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im); + im = npyv_select_@sfx@(re_nnanmask, im, nan); + re = npyv_select_@sfx@(im_nnanmask, re, nan); + + npyv_@sfx@ larger = npyv_max_@sfx@(re, im); + npyv_@sfx@ smaller = npyv_min_@sfx@(im, re); + /* + * Calculate div_mask to prevent 0./0. and inf/inf operations in div + */ + npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@()); + npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf); + npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask)); + + npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger); + npyv_@sfx@ hypot = npyv_sqrt_@sfx@( + npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@) + )); + return npyv_mul_@sfx@(hypot, larger); } -/**end repeat1**/ +#endif // VECTOR /**end repeat**/ /******************************************************************************** @@ -725,55 +312,318 @@ run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const * complex types * #TYPE = CFLOAT, CDOUBLE# * #ftype = npy_float, npy_double# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #sfx = f32, f64# * #c = f, # * #C = F, # */ /**begin repeat1 * arithmetic - * #kind = add, subtract# - * #OP = +, -# - * #PW = 1, 0# + * #kind = add, subtract, multiply# + * #vectorf = npyv_add, npyv_sub, simd_cmul# + * #OP = +, -, *# + * #PW = 1, 0, 0# + * #is_mul = 0*2, 1# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - // Parenthesis around @PW@ tells clang dead code is intentional - if (IS_BINARY_REDUCE && (@PW@)) { - npy_intp n = dimensions[0]; - @ftype@ * or = ((@ftype@ *)args[0]); - @ftype@ * oi = ((@ftype@ *)args[0]) + 1; + npy_intp len = dimensions[0]; + char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2]; + npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2]; +#if @PW@ + // reduce + if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst && + b_ssrc1 % (sizeof(@ftype@)*2) == 0 + ) { + @ftype@ *rl_im = (@ftype@ *)b_src0; @ftype@ rr, ri; - - @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2); - *or @OP@= rr; - *oi @OP@= ri; + @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2); + rl_im[0] @OP@= rr; + rl_im[1] @OP@= ri; return; } - if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r @OP@ in2r; - ((@ftype@ *)op1)[1] = in1i @OP@ in2i; +#endif +#if @VECTOR@ + if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) || + is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) || + b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 || + b_ssrc0 % sizeof(@ftype@) != 0 || + b_ssrc1 % sizeof(@ftype@) != 0 + ) { + goto loop_scalar; + } + const @ftype@ *src0 = (@ftype@*)b_src0; + const @ftype@ *src1 = (@ftype@*)b_src1; + @ftype@ *dst = (@ftype@*)b_dst; + + const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@); + const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@); + const npy_intp sdst = b_sdst / sizeof(@ftype@); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + const int loadable0 = npyv_loadable_stride_s64(ssrc0); + const int loadable1 = npyv_loadable_stride_s64(ssrc1); + const int storeable = npyv_storable_stride_s64(sdst); + + // lots**lots of specializations, to squeeze out max performance + // contig + if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); + npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); + npyv_@sfx@ r = @vectorf@_@sfx@(a, b); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // scalar 0 + else if (ssrc0 == 0) { + npyv_@sfx@x2 a = simd_set2_@sfx@(src0); + // contig + if (ssrc1 == 2 && sdst == ssrc1) { + for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_mul@ + npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // non-contig + else if (loadable1 && storeable) { + for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) { + npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); + npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) { + #if @is_mul@ + npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + else { + goto loop_scalar; + } + } + // scalar 1 + else if (ssrc1 == 0) { + npyv_@sfx@x2 b = simd_set2_@sfx@(src1); + if (ssrc0 == 2 && sdst == ssrc0) { + for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_mul@ + npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); + npyv_store2_till_@sfx@(dst, len, r); + } } + // non-contig + else if (loadable0 && storeable) { + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) { + #if @is_mul@ + npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + else { + goto loop_scalar; + } + } + #if @is_mul@ + // non-contig + else if (loadable0 && loadable1 && storeable) { + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, + src1 += ssrc1*vstep, dst += sdst*vstep + ) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); + npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); + npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, + src1 += ssrc1*hstep, dst += sdst*hstep + ) { + #if @is_mul@ + npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); + npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); + npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + #endif + else { + goto loop_scalar; + } + npyv_cleanup(); + return; +loop_scalar: +#endif + for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) { + const @ftype@ a_r = ((@ftype@ *)b_src0)[0]; + const @ftype@ a_i = ((@ftype@ *)b_src0)[1]; + const @ftype@ b_r = ((@ftype@ *)b_src1)[0]; + const @ftype@ b_i = ((@ftype@ *)b_src1)[1]; + #if @is_mul@ + ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i; + ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r; + #else + ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r; + ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i; + #endif } } /**end repeat1**/ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply) +/**begin repeat1 + * #kind = conjugate, square# + * #is_square = 0, 1# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i; - ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; + npy_intp len = dimensions[0]; + char *b_src = args[0], *b_dst = args[1]; + npy_intp b_ssrc = steps[0], b_sdst = steps[1]; +#if @VECTOR@ + if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) || + b_sdst % sizeof(@ftype@) != 0 || + b_ssrc % sizeof(@ftype@) != 0 + ) { + goto loop_scalar; + } + const @ftype@ *src = (@ftype@*)b_src; + @ftype@ *dst = (@ftype@*)b_dst; + const npy_intp ssrc = b_ssrc / sizeof(@ftype@); + const npy_intp sdst = b_sdst / sizeof(@ftype@); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + if (ssrc == 2 && ssrc == sdst) { + for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src); + npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_store2_till_@sfx@(dst, len, r); + } + } + else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) { + for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src); + npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) { + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) { + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_store2_till_@sfx@(dst, len, r); } } + else { + goto loop_scalar; + } + npyv_cleanup(); + return; +loop_scalar: +#endif + for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) { + const @ftype@ rl = ((@ftype@ *)b_src)[0]; + const @ftype@ im = ((@ftype@ *)b_src)[1]; + #if @is_square@ + ((@ftype@ *)b_dst)[0] = rl*rl - im*im; + ((@ftype@ *)b_dst)[1] = rl*im + im*rl; + #else + ((@ftype@ *)b_dst)[0] = rl; + ((@ftype@ *)b_dst)[1] = -im; + #endif + } } +/**end repeat1**/ /**end repeat**/ |