Diffstat (limited to 'numpy/core')
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src            |  22 |
-rw-r--r-- | numpy/core/src/common/simd/avx2/operators.h          |   7 |
-rw-r--r-- | numpy/core/src/common/simd/avx512/conversion.h       |   4 |
-rw-r--r-- | numpy/core/src/common/simd/avx512/operators.h        |  15 |
-rw-r--r-- | numpy/core/src/common/simd/neon/operators.h          |   5 |
-rw-r--r-- | numpy/core/src/common/simd/sse/operators.h           |   7 |
-rw-r--r-- | numpy/core/src/common/simd/vsx/operators.h           |   5 |
-rw-r--r-- | numpy/core/src/umath/loops_comparison.dispatch.c.src | 201 |
8 files changed, 150 insertions, 116 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index f8a0a3196..0f3e4fc8f 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -31,6 +31,7 @@
  * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
  */
 #if @simd_sup@
 /***************************
@@ -332,6 +333,13 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
 
+#if @bitw8b_sup@
+SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_2(andc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+#endif
+
 /***************************
  * Conversion
  ***************************/
@@ -462,9 +470,6 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
 SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
 SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
 /**end repeat**/
-SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8)
-SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8)
-SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8)
 /***************************
  * Conversions
  ***************************/
@@ -503,6 +508,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
  */
 #if @simd_sup@
 
@@ -584,6 +590,13 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 
+#if @bitw8b_sup@
+SIMD_INTRIN_DEF(andc_@sfx@)
+SIMD_INTRIN_DEF(andc_@bsfx@)
+SIMD_INTRIN_DEF(orc_@bsfx@)
+SIMD_INTRIN_DEF(xnor_@bsfx@)
+#endif
+
 /***************************
  * Conversion
  ***************************/
@@ -713,9 +726,6 @@ SIMD_INTRIN_DEF(or_@bsfx@)
 SIMD_INTRIN_DEF(xor_@bsfx@)
 SIMD_INTRIN_DEF(not_@bsfx@)
 /**end repeat**/
-SIMD_INTRIN_DEF(andc_b8)
-SIMD_INTRIN_DEF(orc_b8)
-SIMD_INTRIN_DEF(xnor_b8)
 /***************************
  * Conversions
  ***************************/
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 0e77fc6be..99ef76dcb 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -115,9 +115,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
 #define npyv_not_b64 npyv_not_u8
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B)
-#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
-#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
+#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm256_cmpeq_epi8
 
 /***************************
  * Comparison
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index a2f56b2ae..474aee446 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -104,8 +104,8 @@ NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
 NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
 #ifdef NPY_HAVE_AVX512BW
-    __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a);
-    __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c);
+    __mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a);
+    __mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c);
     return npyv_pack_b8_b16(ab, cd);
 #else
     const __m512i idx = _mm512_setr_epi32(
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index 8c98b72dd..b856b345a 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -140,6 +140,9 @@
     #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
 #endif
 
+// ANDC
+#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A)
+
 /***************************
  * Logical (boolean)
  ***************************/
@@ -152,8 +155,8 @@
     #define npyv_xor_b16 _kxor_mask32
     #define npyv_not_b8 _knot_mask64
     #define npyv_not_b16 _knot_mask32
-    #define npyv_andc_b8 _kandn_mask64
-    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
+    #define npyv_andc_b8(A, B) _kandn_mask64(B, A)
+    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
     #define npyv_xnor_b8 _kxnor_mask64
 #elif defined(NPY_HAVE_AVX512BW)
     NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
@@ -173,9 +176,9 @@
     NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
     { return ~a; }
     NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b)
-    { return (~a) & b; }
+    { return a & (~b); }
     NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b)
-    { return (~a) | b; }
+    { return a | (~b); }
     NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b)
     { return ~(a ^ b); }
 #else
@@ -187,8 +190,8 @@
     #define npyv_xor_b16 _mm512_xor_si512
     #define npyv_not_b8 npyv_not_u8
     #define npyv_not_b16 npyv_not_u8
-    #define npyv_andc_b8 _mm512_andnot_si512
-    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
+    #define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A)
+    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
     #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
 #endif
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 6c155fc67..a08fa5390 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -117,8 +117,9 @@
 #define npyv_not_b64 npyv_not_u64
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) vbicq_u8(B, A)
-#define npyv_orc_b8(A, B) vornq_u8(B, A)
+#define npyv_andc_u8 vbicq_u8
+#define npyv_andc_b8 vbicq_u8
+#define npyv_orc_b8 vornq_u8
 #define npyv_xnor_b8 vceqq_u8
 
 /***************************
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 51bdca356..86dbcfea5 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -116,9 +116,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
 #define npyv_not_b64 npyv_not_u8
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B)
-#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
-#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
+#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A)
+#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm_cmpeq_epi8
 
 /***************************
  * Comparison
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h
index fc29ba920..b01d85321 100644
--- a/numpy/core/src/common/simd/vsx/operators.h
+++ b/numpy/core/src/common/simd/vsx/operators.h
@@ -134,8 +134,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 { return vec_nor(a, a); }
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) vec_andc(B, A)
-#define npyv_orc_b8(A, B) vec_orc(B, A)
+#define npyv_andc_u8 vec_andc
+#define npyv_andc_b8 vec_andc
+#define npyv_orc_b8 vec_orc
 #define npyv_xnor_b8 vec_eqv
 
 /***************************
diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src
index 07bbf0354..01d58fbf9 100644
--- a/numpy/core/src/umath/loops_comparison.dispatch.c.src
+++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src
@@ -1,6 +1,6 @@
 /*@targets
  ** $maxopt baseline
- ** sse2 sse41 avx2 avx512f avx512_skx
+ ** sse2 sse42 avx2 avx512f avx512_skx
  ** vsx2 vsx3
  ** neon
  **/
@@ -15,18 +15,23 @@
 // Provides the various *_LOOP macros
 #include "fast_loop_macros.h"
 
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
 /**begin repeat
  * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64#
+ * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
  * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
- * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
+ * #VOP = cmpeq, cmpneq, cmplt, cmple#
  */
-
-#if @VECTOR@
+#if @VECTOR@ && !((@eq@ || @neq@) && @signed@)
 static void simd_binary_@kind@_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -205,10 +210,11 @@ static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len)
 /**end repeat**/
 
 /**begin repeat
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
- * #VOP = xnor, xor, andc, orc, andc, orc#
- * #rev = 0, 0, 0, 0, 1, 1#
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
+ * #VOP = xnor, xor, andc, orc#
  */
 
 #if NPY_SIMD
@@ -224,14 +230,10 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len)
     for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
         // Whatever element in src != 0x0 is converted to 0xFF
-        npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero);
-        npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero);
-#if !@rev@
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-#else
-        npyv_b8 c = npyv_@VOP@_b8(b, a);
-#endif
-        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
     }
 
     for (; len > 0; --len, ++src1, ++src2, ++dst) {
@@ -248,18 +250,14 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len)
     npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
     const npyv_u8 vzero = npyv_setall_u8(0x0);
    const npyv_u8 vscalar = npyv_setall_u8(scalar);
-    const npyv_b8 a = npyv_cmpneq_u8(vscalar, vzero);
+    const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero);
     const npyv_u8 truemask = npyv_setall_u8(0x1);
     const int vstep = npyv_nlanes_u8;
 
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-        npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero);
-#if !@rev@
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-#else
-        npyv_b8 c = npyv_@VOP@_b8(b, a);
-#endif
-        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
     }
 
     for (; len > 0; --len, ++src, ++dst) {
@@ -275,18 +273,14 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
     npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
     const npyv_u8 vzero = npyv_setall_u8(0x0);
     const npyv_u8 vscalar = npyv_setall_u8(scalar);
-    const npyv_b8 b = npyv_cmpneq_u8(vscalar, vzero);
+    const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero);
     const npyv_u8 truemask = npyv_setall_u8(0x1);
     const int vstep = npyv_nlanes_u8;
 
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-        npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero);
-#if !@rev@
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-#else
-        npyv_b8 c = npyv_@VOP@_b8(b, a);
-#endif
-        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
    }
 
     for (; len > 0; --len, ++src, ++dst) {
@@ -297,73 +291,73 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
 #endif
 /**end repeat**/
-
 /**begin repeat
  * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, npy_ulonglong, npy_longlong, npy_float, npy_double#
  * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
+ * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
  * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
  */
-static NPY_INLINE int
+#if !((@eq@ || @neq@) && @signed@)
+static NPY_INLINE void
 run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
 #if @VECTOR@
     /* argument one scalar */
     if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
         simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
-        return 1;
+        return;
     }
     /* argument two scalar */
     else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
         simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
-        return 1;
+        return;
     }
     else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
         simd_binary_@kind@_@sfx@(args, dimensions[0]);
-        return 1;
+        return;
     }
 #endif
-    return 0;
-}
-/**end repeat1**/
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** BOOLEAN LOOPS **
- *****************************************************************************
- */
-
-/**begin repeat
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_binary_simd_@kind@_b8(args, dimensions, steps)) {
-        BINARY_LOOP {
-            npy_bool in1 = *((npy_bool *)ip1) != 0;
-            npy_bool in2 = *((npy_bool *)ip2) != 0;
-            *((npy_bool *)op1)= in1 @OP@ in2;
-        }
+    BINARY_LOOP {
+#if @bool@
+        npy_bool in1 = *((npy_bool *)ip1) != 0;
+        npy_bool in2 = *((npy_bool *)ip2) != 0;
+#else
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+#endif
+        *((npy_bool *)op1) = in1 @OP@ in2;
     }
 }
+#endif
+/**end repeat1**/
 /**end repeat**/
 
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+
 /*
- *****************************************************************************
- ** INTEGER LOOPS
- *****************************************************************************
+ * In order to reduce the size of the binary generated from this source, the
+ * following rules are applied: 1) each data type implements its function
+ * 'greater' as a call to the function 'less' but with the arguments swapped,
+ * the same applies to the function 'greater_equal', which is implemented
+ * with a call to the function 'less_equal', and 2) for the integer datatypes
+ * of the same size (eg 8-bit), a single kernel of the functions 'equal' and
+ * 'not_equal' is used to implement both signed and unsigned types.
  */
 /**begin repeat
  * Signed and Unsigned types
- * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- *         npy_byte, npy_short, npy_int, npy_long, npy_longlong#
  * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
  *         BYTE, SHORT, INT, LONG, LONGLONG#
  * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG,
@@ -371,11 +365,13 @@
  * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1#
  */
 #undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
 #if 0
 /**begin repeat1
  * #len = 8, 16, 32, 64#
  */
 #elif NPY_BITSOF_@STYPE@ == @len@
+    #define TO_SIMD_UTYPE(X) X##_u@len@
     #if @signed@
         #define TO_SIMD_SFX(X) X##_s@len@
     #else
@@ -385,50 +381,71 @@
 #endif
 
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
+ * #kind = greater, greater_equal#
+ * #kind_to = less, less_equal#
 */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((npy_bool *)op1) = in1 @OP@ in2;
-        }
-    }
+    char *nargs[3] = {args[1], args[0], args[2]};
+    npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
+    TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps);
 }
 /**end repeat1**/
-/**end repeat**/
 
-/*
- *****************************************************************************
- ** FLOAT LOOPS **
- *****************************************************************************
+/**begin repeat1
+ * #kind = less, less_equal#
  */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps);
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps);
+}
+/**end repeat1**/
+/**end repeat**/
 
 /**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #sfx = f32, f64#
+ * Boolean & Float types
+ * #TYPE = BOOL, FLOAT, DOUBLE#
+ * #sfx = b8, f32, f64#
+ * #fp = 0, 1, 1#
  */
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
+ * #kind = greater, greater_equal#
+ * #kind_to = less, less_equal#
 */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!run_binary_simd_@kind@_@sfx@(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((npy_bool *)op1) = in1 @OP@ in2;
-        }
-    }
+    char *nargs[3] = {args[1], args[0], args[2]};
+    npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps);
+#if @fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_@kind@_@sfx@(args, dimensions, steps);
+#if @fp@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 /**end repeat1**/
 /**end repeat**/
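Note on the operand order used above: after this change the complemented operand of `npyv_andc_*` and `npyv_orc_*` is the second argument, as the AVX512BW fallbacks spell out (`a & (~b)` and `a | (~b)`), whereas the old `npyv_andc_b8` complemented its first argument. The following is only a minimal scalar sketch of those boolean identities, not part of NumPy's SIMD API; the helper names are hypothetical.

```c
#include <assert.h>
#include <stdint.h>

/* Scalar models of the identities the SIMD macros map to after this diff. */
static uint8_t andc_u8(uint8_t a, uint8_t b) { return a & (uint8_t)~b; }  /* a AND NOT b */
static uint8_t orc_u8 (uint8_t a, uint8_t b) { return a | (uint8_t)~b; }  /* a OR  NOT b */
static uint8_t xnor_u8(uint8_t a, uint8_t b) { return (uint8_t)~(a ^ b); }/* NOT (a XOR b) */

int main(void)
{
    /* The second argument is the one that gets complemented. */
    assert(andc_u8(0xFF, 0x0F) == 0xF0);
    assert(orc_u8 (0x00, 0xF0) == 0x0F);
    /* xnor is symmetric: identical inputs give all-ones. */
    assert(xnor_u8(0xAA, 0xAA) == 0xFF);
    assert(xnor_u8(0xFF, 0x00) == 0x00);
    return 0;
}
```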
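The comment added to loops_comparison.dispatch.c.src describes the size-reduction rule: `greater` and `greater_equal` are not compiled as separate kernels; the inner function swaps the two input pointers and their strides and calls the `less` / `less_equal` runner. A stand-alone sketch of that argument-swapping trick is shown below, using plain C types and hypothetical names rather than NumPy's dispatched loops.

```c
#include <stddef.h>
#include <stdio.h>

/* Hypothetical strided kernel: out[i] = (a[i] < b[i]) over int data. */
static void less_loop(char **args, const size_t *dims, const ptrdiff_t *steps)
{
    char *ip1 = args[0], *ip2 = args[1], *op = args[2];
    for (size_t i = 0; i < dims[0]; ++i,
             ip1 += steps[0], ip2 += steps[1], op += steps[2]) {
        *(unsigned char *)op = *(const int *)ip1 < *(const int *)ip2;
    }
}

/* "greater" reuses the same kernel with the inputs (and their strides) swapped. */
static void greater_loop(char **args, const size_t *dims, const ptrdiff_t *steps)
{
    char *nargs[3]      = {args[1], args[0], args[2]};
    ptrdiff_t nsteps[3] = {steps[1], steps[0], steps[2]};
    less_loop(nargs, dims, nsteps);
}

int main(void)
{
    int a[3] = {1, 5, 3}, b[3] = {2, 4, 3};
    unsigned char out[3];
    char *args[3] = {(char *)a, (char *)b, (char *)out};
    size_t dims[1] = {3};
    ptrdiff_t steps[3] = {sizeof(int), sizeof(int), 1};
    greater_loop(args, dims, steps);
    printf("%d %d %d\n", out[0], out[1], out[2]); /* prints: 0 1 0 */
    return 0;
}
```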