/*@targets
 ** $maxopt baseline
 ** sse2 sse42 avx2 avx512f avx512_skx
 ** vsx2 vsx3
 ** neon
 ** vx vxe
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
/**begin repeat
 * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
 * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64#
 * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
 * #VECTOR = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
 */
/**begin repeat1
 * #kind = equal, not_equal, less, less_equal#
 * #eq = 1, 0, 0, 0#
 * #neq = 0, 1, 0, 0#
 * #OP = ==, !=, <, <=#
 * #VOP = cmpeq, cmpneq, cmplt, cmple#
 */
#if @VECTOR@ && !((@eq@ || @neq@) && @signed@)
static void simd_binary_@kind@_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 truemask = npyv_setall_u8(0x1);
    const int vstep = npyv_nlanes_u8;

    // Unroll the loop so that each iteration produces one full u8 vector
    // of 'vstep' boolean results.
    for (; len >= vstep;
         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
        #if @len@ >= 8
        npyv_@sfx@ a1 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 0);
        npyv_@sfx@ b1 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 0);
        npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b1);
        #if @len@ >= 16
        npyv_@sfx@ a2 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 1);
        npyv_@sfx@ b2 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 1);
        npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b2);
        #if @len@ >= 32
        npyv_@sfx@ a3 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 2);
        npyv_@sfx@ b3 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 2);
        npyv_@sfx@ a4 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 3);
        npyv_@sfx@ b4 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 3);
        npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b3);
        npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b4);
        #if @len@ == 64
        npyv_@sfx@ a5 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 4);
        npyv_@sfx@ b5 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 4);
        npyv_@sfx@ a6 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 5);
        npyv_@sfx@ b6 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 5);
        npyv_@sfx@ a7 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 6);
        npyv_@sfx@ b7 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 6);
        npyv_@sfx@ a8 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 7);
        npyv_@sfx@ b8 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 7);
        npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b5);
        npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b6);
        npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b7);
        npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b8);
        #endif // @len@ == 64
        #endif // @len@ >= 32
        #endif // @len@ >= 16
        #endif // @len@ >= 8

        // Pack the 'c' vectors into a single vector 'r'
        #if @len@ == 8
        npyv_u8 r = npyv_cvt_u8_b8(c1);
        #elif @len@ == 16
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
        #elif @len@ == 32
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
        #elif @len@ == 64
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
        #endif
        npyv_store_u8(dst, npyv_and_u8(r, truemask));
    }

    // Scalar tail for the remaining elements
    for (; len > 0; --len, ++src1, ++src2, ++dst) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
        *dst = a @OP@ b;
    }
}
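/*
 * Note on the unrolling above: 'vstep' counts u8 lanes, while each input
 * vector holds npyv_nlanes lanes of the source type. With 128-bit SIMD, for
 * example, a u8 vector holds 16 lanes but an f64 vector only 2, so the
 * 64-bit case loads eight vectors per operand and packs the eight boolean
 * result vectors down to one byte vector. The final AND with 'truemask'
 * converts the 0xFF mask bytes into the canonical npy_bool value 0x1.
 */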
static void simd_binary_scalar1_@kind@_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
    const npyv_@sfx@ a = npyv_setall_@sfx@(scalar);
    const npyv_u8 truemask = npyv_setall_u8(0x1);
    const int vstep = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        #if @len@ >= 8
        npyv_@sfx@ b1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0);
        npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a, b1);
        #if @len@ >= 16
        npyv_@sfx@ b2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1);
        npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a, b2);
        #if @len@ >= 32
        npyv_@sfx@ b3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2);
        npyv_@sfx@ b4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3);
        npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a, b3);
        npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a, b4);
        #if @len@ == 64
        npyv_@sfx@ b5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4);
        npyv_@sfx@ b6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5);
        npyv_@sfx@ b7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6);
        npyv_@sfx@ b8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7);
        npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a, b5);
        npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a, b6);
        npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a, b7);
        npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a, b8);
        #endif // @len@ == 64
        #endif // @len@ >= 32
        #endif // @len@ >= 16
        #endif // @len@ >= 8

        // Pack the 'c' vectors into a single vector 'r'
        #if @len@ == 8
        npyv_u8 r = npyv_cvt_u8_b8(c1);
        #elif @len@ == 16
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
        #elif @len@ == 32
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
        #elif @len@ == 64
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
        #endif
        npyv_store_u8(dst, npyv_and_u8(r, truemask));
    }

    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_@sfx@ b = *src;
        *dst = scalar @OP@ b;
    }
}

static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
    const npyv_@sfx@ b = npyv_setall_@sfx@(scalar);
    const npyv_u8 truemask = npyv_setall_u8(0x1);
    const int vstep = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        #if @len@ >= 8
        npyv_@sfx@ a1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0);
        npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b);
        #if @len@ >= 16
        npyv_@sfx@ a2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1);
        npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b);
        #if @len@ >= 32
        npyv_@sfx@ a3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2);
        npyv_@sfx@ a4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3);
        npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b);
        npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b);
        #if @len@ == 64
        npyv_@sfx@ a5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4);
        npyv_@sfx@ a6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5);
        npyv_@sfx@ a7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6);
        npyv_@sfx@ a8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7);
        npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b);
        npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b);
        npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b);
        npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b);
        #endif // @len@ == 64
        #endif // @len@ >= 32
        #endif // @len@ >= 16
        #endif // @len@ >= 8

        // Pack the 'c' vectors into a single vector 'r'
        #if @len@ == 8
        npyv_u8 r = npyv_cvt_u8_b8(c1);
        #elif @len@ == 16
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
        #elif @len@ == 32
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
        #elif @len@ == 64
        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
        #endif
        npyv_store_u8(dst, npyv_and_u8(r, truemask));
    }

    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_@sfx@ a = *src;
        *dst = a @OP@ scalar;
    }
}
#endif
/**end repeat1**/
/**end repeat**/
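/*
 * The boolean kernels below operate on inverted inputs: npyv_cmpeq_u8
 * against zero yields an "is false" mask (0xFF where the byte is 0x0).
 * Writing a = !A and b = !B for logical values A and B, the four
 * comparisons reduce to bitwise operations on those masks:
 *   A == B  <=>  xnor(a, b)
 *   A != B  <=>  xor(a, b)
 *   A <  B  <=>  !A && B  <=>  andc(a, b)   (a & ~b)
 *   A <= B  <=>  !A || B  <=>  orc(a, b)    (a | ~b)
 * which is why the VOP parameter maps to xnor/xor/andc/orc rather than
 * to the cmp* intrinsics. The store then masks the 0xFF result lanes with
 * 'truemask' (npyv_and_u8) so true lanes are written as the canonical 0x01.
 */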
/**begin repeat
 * #kind = equal, not_equal, less, less_equal#
 * #eq = 1, 0, 0, 0#
 * #neq = 0, 1, 0, 0#
 * #OP = ==, !=, <, <=#
 * #VOP = xnor, xor, andc, orc#
 */
#if NPY_SIMD
static void simd_binary_@kind@_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 truemask = npyv_setall_u8(0x1);
    const npyv_u8 vzero = npyv_setall_u8(0x0);
    const int vstep = npyv_nlanes_u8;

    for (; len >= vstep;
         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
        // Elements equal to 0x0 become 0xFF, i.e. 'a' and 'b' are
        // "is false" masks of the inputs.
        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
        npyv_b8 c = npyv_@VOP@_b8(a, b);
        // Plain AND with 0x1 stores the canonical npy_bool 0/1
        // (andc here would yield 0xFE for true lanes).
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst) {
        const npyv_lanetype_u8 a = *src1 != 0;
        const npyv_lanetype_u8 b = *src2 != 0;
        *dst = a @OP@ b;
    }
}

static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero = npyv_setall_u8(0x0);
    const npyv_u8 vscalar = npyv_setall_u8(scalar);
    const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask = npyv_setall_u8(0x1);
    const int vstep = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        npyv_b8 c = npyv_@VOP@_b8(a, b);
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 b = *src != 0;
        *dst = scalar @OP@ b;
    }
}

static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero = npyv_setall_u8(0x0);
    const npyv_u8 vscalar = npyv_setall_u8(scalar);
    const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask = npyv_setall_u8(0x1);
    const int vstep = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        npyv_b8 c = npyv_@VOP@_b8(a, b);
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 a = *src != 0;
        *dst = a @OP@ scalar;
    }
}
#endif
/**end repeat**/
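/*
 * Dispatch helpers: pick the widest applicable kernel for the given strides.
 * Loosely, the IS_BLOCKABLE_BINARY_*_BOOL checks verify the itemsize,
 * alignment, and stride pattern (a zero input stride means that argument is
 * a single broadcast scalar, handled by the scalar1/scalar2 kernels);
 * anything that does not qualify falls through to the generic strided
 * BINARY_LOOP below.
 */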
/**begin repeat
 * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int,
 *         npy_ulonglong, npy_longlong, npy_float, npy_double#
 * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
 * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
 * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
 * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
 * #VECTOR = NPY_SIMD*9, NPY_SIMD_F32, NPY_SIMD_F64#
 */
/**begin repeat1
 * #kind = equal, not_equal, less, less_equal#
 * #eq = 1, 0, 0, 0#
 * #neq = 0, 1, 0, 0#
 * #OP = ==, !=, <, <=#
 */
#if !((@eq@ || @neq@) && @signed@)
static NPY_INLINE void
run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @VECTOR@
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
        return;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
        return;
    }
    else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_@kind@_@sfx@(args, dimensions[0]);
        return;
    }
#endif

    BINARY_LOOP {
#if @bool@
        npy_bool in1 = *((npy_bool *)ip1) != 0;
        npy_bool in2 = *((npy_bool *)ip2) != 0;
#else
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
#endif
        *((npy_bool *)op1) = in1 @OP@ in2;
    }
}
#endif
/**end repeat1**/
/**end repeat**/

/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/

/*
 * In order to reduce the size of the binary generated from this source, the
 * following rules are applied: 1) each data type implements its function
 * 'greater' as a call to the function 'less' but with the arguments swapped,
 * and the same applies to the function 'greater_equal', which is implemented
 * with a call to the function 'less_equal'; and 2) for integer datatypes of
 * the same size (e.g. 8-bit), a single kernel of the functions 'equal' and
 * 'not_equal' is used to implement both signed and unsigned types.
 */

/**begin repeat
 * Signed and Unsigned types
 *  #TYPE  = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *           BYTE,  SHORT,  INT,  LONG,  LONGLONG#
 *  #STYPE = BYTE,  SHORT,  INT,  LONG,  LONGLONG,
 *           BYTE,  SHORT,  INT,  LONG,  LONGLONG#
 *  #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1#
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
/**begin repeat1
 * #len = 8, 16, 32, 64#
 */
#elif NPY_BITSOF_@STYPE@ == @len@
    #define TO_SIMD_UTYPE(X) X##_u@len@
    #if @signed@
        #define TO_SIMD_SFX(X) X##_s@len@
    #else
        #define TO_SIMD_SFX(X) X##_u@len@
    #endif
/**end repeat1**/
#endif

/**begin repeat1
 * #kind = greater, greater_equal#
 * #kind_to = less, less_equal#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *nargs[3] = {args[1], args[0], args[2]};
    npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
    TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps);
}
/**end repeat1**/

/**begin repeat1
 * #kind = less, less_equal#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps);
}
/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps);
}
/**end repeat1**/
/**end repeat**/
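/*
 * Illustration of the two size-reduction rules for a concrete type: on a
 * platform where NPY_BITSOF_INT == 32, INT_greater above expands to a call
 * to run_binary_simd_less_s32 with the argument and step pointers swapped,
 * while both INT_equal and UINT_equal expand to the same unsigned kernel,
 * run_binary_simd_equal_u32, since lane-wise equality is sign-agnostic.
 */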
/**begin repeat
 * Boolean & Float types
 * #TYPE = BOOL, FLOAT, DOUBLE#
 * #sfx = b8, f32, f64#
 * #fp = 0, 1, 1#
 */
/**begin repeat1
 * #kind = greater, greater_equal#
 * #kind_to = less, less_equal#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *nargs[3] = {args[1], args[0], args[2]};
    npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
    run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps);
#if @fp@
    // Comparisons are not expected to report floating-point errors, so
    // discard any FP exception flags the loop may have raised.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}
/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal, less, less_equal#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_@kind@_@sfx@(args, dimensions, steps);
#if @fp@
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}
/**end repeat1**/
/**end repeat**/
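/*
 * Each inner function above is compiled once per CPU feature target listed
 * in the '@targets' block at the top of this file; NPY_CPU_DISPATCH_CURFX
 * suffixes the symbol with the current target, and the runtime dispatcher
 * selects the best available variant for the host CPU.
 */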