/*@targets
 ** $maxopt baseline
 ** sse2 avx2 avx512f
 ** vx vxe
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

// TODO: replace raw SIMD with NPYV
//###############################################################################
//## Real Single/Double precision
//###############################################################################
/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
#ifdef NPY_HAVE_SSE2
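/*
 * A note on the blocked-loop structure used by the kernels below (a reading
 * of the LOOP_* helpers in fast_loop_macros.h, recorded here for orientation):
 * LOOP_BLOCK_ALIGN_VAR runs a scalar peel loop until the output pointer
 * reaches the requested alignment, LOOP_BLOCKED then walks the middle of the
 * array in vector-sized blocks, and LOOP_BLOCKED_END finishes the scalar
 * tail. Only the output is aligned by the peel loop, which is why each kernel
 * still branches on the alignment of its inputs to choose between aligned and
 * unaligned loads.
 */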
/**begin repeat
 * #type = npy_float, npy_double#
 * #TYPE = FLOAT, DOUBLE#
 * #scalarf = npy_sqrtf, npy_sqrt#
 * #c = f, #
 * #vtype = __m128, __m128d#
 * #vtype256 = __m256, __m256d#
 * #vtype512 = __m512, __m512d#
 * #vpre = _mm, _mm#
 * #vpre256 = _mm256, _mm256#
 * #vpre512 = _mm512, _mm512#
 * #vsuf = ps, pd#
 * #vsufs = ss, sd#
 * #nan = NPY_NANF, NPY_NAN#
 * #double = 0, 1#
 * #cast = _mm_castps_si128, _mm_castpd_si128#
 */
/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # VOP = add, sub, mul, div#
 */
static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
            npy_is_aligned(&ip2[i], vector_size_bytes)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
            npy_is_aligned(&ip2[i], vector_size_bytes)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
            npy_is_aligned(&ip2[i], vector_size_bytes)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
                @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
                @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[i];
    }
}
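/*
 * The scalar1/scalar2 variants below handle the case where one operand is a
 * single (stride-0) value, as selected by IS_BLOCKABLE_BINARY_SCALAR1/2
 * further down: the scalar is broadcast once with _set1 outside the loop, so
 * only the array operand's alignment needs to be dispatched on.
 */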
static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[0] @OP@ ip2[i];
    }
}

static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[0];
    }
}

/**end repeat1**/
/**end repeat**/

#else // NPY_HAVE_SSE2
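/*
 * Builds without SSE2 (e.g. the VX/VXE targets named in the @targets block
 * above) take this branch instead: the same three kernel shapes are written
 * with the universal intrinsics (npyv_*), sized by NPY_SIMD_WIDTH, and
 * without the per-operand aligned/unaligned load specializations used by the
 * raw SSE/AVX code above.
 */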
/**begin repeat
 * #type = npy_float, npy_double#
 * #TYPE = FLOAT, DOUBLE#
 * #sfx = f32, f64#
 * #CHK = _F32, _F64#
 */
#if NPY_SIMD@CHK@
/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # VOP = add, sub, mul, div#
 */
static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] @OP@ ip2[i];
    }
    /* lots of specializations, to squeeze out max performance */
    if (ip1 == ip2) {
        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
            npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
            npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
            npyv_store_@sfx@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
            npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
            npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
            npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
            npyv_store_@sfx@(&op[i], c);
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[i];
    }
}

static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[0] @OP@ ip2[i];
    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
        npyv_store_@sfx@(&op[i], v3);
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[0] @OP@ ip2[i];
    }
}

static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] @OP@ ip2[0];
    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
        npyv_store_@sfx@(&op[i], v3);
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[0];
    }
}
/**end repeat1**/
#endif /* NPY_SIMD@CHK@ */
/**end repeat**/
#endif // NPY_HAVE_SSE2

/**begin repeat
 * Float types
 * #type = npy_float, npy_double, npy_longdouble#
 * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 * #vector = 1, 1, 0#
 * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 #
 */
/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 */
static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2
    @type@ * ip1 = (@type@ *)args[0];
    @type@ * ip2 = (@type@ *)args[1];
    @type@ * op = (@type@ *)args[2];
    npy_intp n = dimensions[0];
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#elif defined NPY_HAVE_AVX2
    const npy_uintp vector_size_bytes = 32;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
#elif @VECTOR@
    @type@ * ip1 = (@type@ *)args[0];
    @type@ * ip2 = (@type@ *)args[1];
    @type@ * op = (@type@ *)args[2];
    npy_intp n = dimensions[0];
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}
/**end repeat1**/
/**end repeat**/
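/*
 * The run_binary_simd_* helpers above return 1 when the operation was fully
 * handled by one of the blocked SIMD kernels (unit strides and no problematic
 * overlap, as checked by the IS_BLOCKABLE_* macros) and 0 otherwise; the
 * inner loops below use that return value to decide whether the generic
 * strided scalar fallback is still needed.
 */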
/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
/**begin repeat
 * Float types
 * #type = npy_float, npy_double#
 * #TYPE = FLOAT, DOUBLE#
 * #c = f, #
 * #C = F, #
 */
/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # PW = 1, 0, 0, 0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if @PW@
        @type@ * iop1 = (@type@ *)args[0];
        npy_intp n = dimensions[0];

        *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP(@type@) {
            io1 @OP@= *(@type@ *)ip2;
        }
        *((@type@ *)iop1) = io1;
#endif
    }
    else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = in1 @OP@ in2;
        }
    }
}
/**end repeat1**/
/**end repeat**/

//###############################################################################
//## Complex Single/Double precision
//###############################################################################
/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
    /**
     * For reasons that are not yet understood, MSVC's aggressive optimization
     * of this code leads to spurious
     * 'RuntimeWarning: invalid value encountered in multiply' warnings.
     *
     * The issue appears to be caused by '_mm512_maskz_loadu_ps'; it should be
     * investigated further when moving this code to NPYV.
     */
    #define AVX512F_NOMSVC
#endif

#ifdef AVX512F_NOMSVC
NPY_FINLINE __mmask16
avx512_get_full_load_mask_ps(void)
{
    return 0xFFFF;
}

NPY_FINLINE __mmask8
avx512_get_full_load_mask_pd(void)
{
    return 0xFF;
}

NPY_FINLINE __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
    return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}

NPY_FINLINE __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}

NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
    return (0x0001 << num_elem) - 0x0001;
}

NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
    return (0x01 << num_elem) - 0x01;
}

/**begin repeat
 * #vsub = ps, pd#
 * #type= npy_float, npy_double#
 * #epi_vsub = epi32, epi64#
 * #vtype = __m512, __m512d#
 * #mask = __mmask16, __mmask8#
 * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
 * #neg_mask = 0x80000000, 0x8000000000000000#
 * #perm_ = 0xb1, 0x55#
 * #cmpx_img_mask = 0xAAAA, 0xAA#
 * #cmpx_re_mask = 0x5555, 0x55#
 * #INF = NPY_INFINITYF, NPY_INFINITY#
 * #NAN = NPY_NANF, NPY_NAN#
 */
NPY_FINLINE @vtype@
avx512_hadd_@vsub@(const @vtype@ x)
{
    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}

NPY_FINLINE @vtype@
avx512_hsub_@vsub@(const @vtype@ x)
{
    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
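/*
 * For reference, avx512_cmul below evaluates the usual complex product
 *
 *     (r1 + i1*j)*(r2 + i2*j) = (r1*r2 - i1*i2) + (r1*i2 + i1*r2)*j
 *
 * on lanes holding interleaved (real, imag) pairs: the permute swaps the two
 * halves of each pair, the two multiplies yield {r1*r2, i1*i2} and
 * {r1*i2, i1*r2}, hsub/hadd combine the neighbouring lanes of each pair, and
 * the final mask_blend keeps the subtraction result in the even (real) lanes
 * and the addition result in the odd (imaginary) lanes.
 */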
NPY_FINLINE @vtype@
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
{
    // x1 = r1, i1
    // x2 = r2, i2
    @vtype@ x3  = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);            // r1*r2, i1*i2
    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);            // r1*i2, r2*i1
    @vtype@ outreal = avx512_hsub_@vsub@(x12);          // r1*r2 - i1*i2, r1*r2 - i1*i2
    @vtype@ outimg = avx512_hadd_@vsub@(x13);           // r1*i2 + i1*r2, r1*i2 + i1*r2
    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
}
/**end repeat**/
#endif

/**begin repeat
 * #TYPE = CFLOAT, CDOUBLE#
 * #type = npy_float, npy_double#
 * #num_lanes = 16, 8#
 * #vsuffix = ps, pd#
 * #epi_vsub = epi32, epi64#
 * #mask = __mmask16, __mmask8#
 * #vtype = __m512, __m512d#
 * #scale = 4, 8#
 * #vindextype = __m512i, __m256i#
 * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
 * #storemask = 0xFF, 0xF#
 * #IS_FLOAT = 1, 0#
 */
/**begin repeat1
 * #func = add, subtract, multiply#
 * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
 */
#if defined AVX512F_NOMSVC
static NPY_INLINE void
AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    @type@* ip1 = (@type@*) args[0];
    @type@* ip2 = (@type@*) args[1];
    @type@* op = (@type@*) args[2];

    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
                    num_remaining_elements, @num_lanes@);
        }
        @vtype@ x1, x2;
        x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
        x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);

        @vtype@ out = @vectorf@_@vsuffix@(x1, x2);

        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);

        ip1 += @num_lanes@;
        ip2 += @num_lanes@;
        op += @num_lanes@;
        num_remaining_elements -= @num_lanes@;
    }
}
#endif // AVX512F_NOMSVC
/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #TYPE = CFLOAT, CDOUBLE#
 * #type= npy_float, npy_double#
 * #esize = 8, 16#
 */
/**begin repeat1
 * #func = add, subtract, multiply#
 */
static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
#if defined AVX512F_NOMSVC
    if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
        AVX512F_@func@_@TYPE@(args, dimensions, steps);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}
/**end repeat1**/
/**end repeat**/

/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
/**begin repeat
 * complex types
 * #TYPE = CFLOAT, CDOUBLE#
 * #ftype = npy_float, npy_double#
 * #c = f, #
 * #C = F, #
 */
/**begin repeat1
 * arithmetic
 * #kind = add, subtract#
 * #OP = +, -#
 * #PW = 1, 0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Parenthesis around @PW@ tells clang dead code is intentional
    if (IS_BINARY_REDUCE && (@PW@)) {
        npy_intp n = dimensions[0];
        @ftype@ * or = ((@ftype@ *)args[0]);
        @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
        @ftype@ rr, ri;

        @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
        *or @OP@= rr;
        *oi @OP@= ri;
        return;
    }
    if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
            ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
            ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
        }
    }
}
/**end repeat1**/
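/*
 * Unlike add/subtract above, complex multiply has no pairwise-reduce branch;
 * when the AVX-512 kernel is unavailable or declines the operation, it falls
 * back to the scalar product (r1*r2 - i1*i2, r1*i2 + i1*r2) for each element.
 */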
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
            ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
            ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
        }
    }
}
/**end repeat**/