author    | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-05-11 21:38:51 +0530
committer | Sayed Adel <seiko@imavr.com>             | 2021-05-20 23:19:50 +0200
commit    | 7c163672933d42e76dd643065acbe36a7274dc00 (patch)
tree      | 74e4b40c40a7d0ff2e42095397acff886f98dda3 /numpy
parent    | b6b32674d634b6dfe9d92212e8a6ced0f1e14319 (diff)
download  | numpy-7c163672933d42e76dd643065acbe36a7274dc00.tar.gz
SIMD: Separate signed and unsigned loops
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 181
1 file changed, 105 insertions, 76 deletions
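
The diff below rewrites the template blocks in loops_arithmetic.dispatch.c.src so that signed and unsigned lane types get their own /**begin repeat**/ sections instead of sharing one block gated on a #signed# flag. For readers unfamiliar with NumPy's .c.src templating, here is a minimal illustrative sketch of how such a block expands at build time; the function name and parameter values are invented for the example and do not appear in the patch:

/**begin repeat
 * #sfx   = s32,       u32#
 * #stype = npy_int32, npy_uint32#
 */
/* The .c.src preprocessor emits one copy of the body per column above,
 * substituting @sfx@ and @stype@, so this produces example_negate_s32
 * and example_negate_u32. */
static @stype@ example_negate_@sfx@(@stype@ x)
{
    return (@stype@)(0 - x);
}
/**end repeat**/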
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 5e54a45de..a52bb36b7 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -36,41 +36,35 @@
  ********************************************************************************/
 #if NPY_SIMD
 /**begin repeat
- * #sfx    = u8, u16, u32, u64, s8, s16, s32, s64#
- * #len    = 8,  16,  32,  64,  8,  16,  32,  64#
- * #signed = 0*4, 1*4#
+ * Signed types
+ * #sfx = s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64#
  */
-#if @signed@
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
-    npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor, neg, vzero;
-    npyv_b@len@ greater_min, noverflow;
-    npy_bool raise;
-    npy_uint64 tobits;
-
     npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
     npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
     npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
     const int vstep            = npyv_nlanes_@sfx@;
     const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
-    if (NPY_UNLIKELY(-1 == scalar)) {
-        noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
-        vzero = npyv_zero_@sfx@();
+    if (scalar == (npyv_lanetype_@sfx@)-1) {
+        npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        npyv_@sfx@ vzero = npyv_zero_@sfx@();
         for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-            a = npyv_load_@sfx@(src);
-            greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
-            noverflow = npyv_and_b@len@(noverflow, greater_min);
-            neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero);
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_b@len@ greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+            noverflow = npyv_and_b@len@(noverflow, greater_min);
+            npyv_@sfx@ neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero);
             npyv_store_@sfx@(dst, neg);
         }
-        tobits = npyv_tobits_b@len@(noverflow);
+        npy_uint64 tobits = npyv_tobits_b@len@(noverflow);
 #if npyv_nlanes_@sfx@ == 64
-        raise = (~tobits) != 0;
+        int raise = (~tobits) != 0;
 #else
-        raise = tobits != (1ULL << vstep)-1;
+        int raise = tobits != (1ULL << vstep)-1;
 #endif
         for (; len > 0; --len, ++src, ++dst) {
@@ -87,36 +81,37 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
         }
     }
     else {
         for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-            nsign_d = npyv_setall_@sfx@(scalar < 0);
-            a = npyv_load_@sfx@(src);
-            nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
-            nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
-            diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
-            to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
-            trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
-            floor = npyv_sub_@sfx@(trunc, to_ninf);
+            npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0);
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+            nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+            npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+            npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf);
             npyv_store_@sfx@(dst, floor);
         }
         for (; len > 0; --len, ++src, ++dst) {
             const npyv_lanetype_@sfx@ a = *src;
-            if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) {
-                npy_set_floatstatus_divbyzero();
-                *dst = 0;
-            } else {
-                *dst = a / scalar;
-                /* Negative quotients needs to be rounded down */
-                if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
-                    *dst = *dst - 1;
-                }
+            *dst = a / scalar;
+            /* Negative quotients needs to be rounded down */
+            if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
+                *dst = *dst - 1;
             }
         }
     }
     npyv_cleanup();
 }
-#else
+/**end repeat**/
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32, u64#
+ * #len = 8, 16, 32, 64#
+ */
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
@@ -134,17 +129,11 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     for (; len > 0; --len, ++src, ++dst) {
         const npyv_lanetype_@sfx@ a = *src;
-        if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) {
-            npy_set_floatstatus_divbyzero();
-            *dst = 0;
-        } else {
-            *dst = a / scalar;
-        }
+        *dst = a / scalar;
     }
     npyv_cleanup();
 }
-#endif
 /**end repeat**/
 #endif
@@ -153,31 +142,78 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
  ********************************************************************************/
 /**begin repeat
- * Unsigned types
+ * Signed types
  * #type = byte, short, int, long, longlong#
  * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
-
+#undef TO_SIMD_SFX
+#if 0
 /**begin repeat1
- * #signed = 1, 0#
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@TYPE@ == @len@
+    #define TO_SIMD_SFX(X) X##_s@len@
+/**end repeat1**/
+#endif
+
+#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(npy_@type@) {
+            const npy_@type@ d = *(npy_@type@ *)ip2;
+            if (NPY_UNLIKELY(d == 0 || (io1 == (npy_@type@)NPY_MIN_@TYPE@ && d == (npy_@type@)-1))) {
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((npy_@type@ *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@type@), NPY_SIMD_WIDTH) &&
+             (*(npy_@type@ *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {
+        BINARY_LOOP {
+            const npy_@type@ in1 = *(npy_@type@ *)ip1;
+            const npy_@type@ in2 = *(npy_@type@ *)ip2;
+            if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_@type@)NPY_MIN_@TYPE@ && in2 == (npy_@type@)-1))) {
+                npy_set_floatstatus_divbyzero();
+                *((npy_@type@ *)op1) = 0;
+            } else{
+                *((npy_@type@ *)op1) = in1 / in2;
+                /* Negative quotients needs to be rounded down */
+                if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) {
+                    *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1;
+                }
+            }
+        }
+    }
+}
+/**end repeat**/
+
+/**begin repeat
+ * Unsigned types
+ * #type = byte, short, int, long, longlong#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
 #undef TO_SIMD_SFX
-#undef SIMD_TYPE
-#undef SIMD_DIVIDE
 #if 0
-/**begin repeat2
+/**begin repeat1
  * #len = 8, 16, 32, 64#
  */
-#elif NPY_BITSOF_@TYPE@ == @len@ && @signed@
-    #define TO_SIMD_SFX(X) X##_s@len@
-    #define SIMD_TYPE npy_@type@
-    #define SIMD_DIVIDE @TYPE@_divide
 #elif NPY_BITSOF_@TYPE@ == @len@
     #define TO_SIMD_SFX(X) X##_u@len@
-    #define SIMD_TYPE npy_u@type@
-    #define SIMD_DIVIDE U@TYPE@_divide
-/**end repeat2**/
+/**end repeat1**/
 #endif
 /*
  * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division
@@ -190,46 +226,39 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
 #endif
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE)
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(U@TYPE@_divide)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(SIMD_TYPE) {
-            const SIMD_TYPE d = *(SIMD_TYPE *)ip2;
-            if (NPY_UNLIKELY(d == 0 || (io1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && d == (SIMD_TYPE)-1))) {
+        BINARY_REDUCE_LOOP(npy_u@type@) {
+            const npy_u@type@ d = *(npy_u@type@ *)ip2;
+            if (NPY_UNLIKELY(d == 0 || (io1 == (npy_u@type@)NPY_MIN_@TYPE@ && d == (npy_u@type@)-1))) {
                 npy_set_floatstatus_divbyzero();
                 io1 = 0;
             } else {
                 io1 /= d;
             }
         }
-        *((SIMD_TYPE *)iop1) = io1;
+        *((npy_u@type@ *)iop1) = io1;
     }
 #if NPY_SIMD && defined(TO_SIMD_SFX)
     // for contiguous block of memory, divisor is a scalar and not 0
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(SIMD_TYPE), NPY_SIMD_WIDTH) &&
-             (*(SIMD_TYPE *)args[1]) != 0) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_u@type@), NPY_SIMD_WIDTH) &&
+             (*(npy_u@type@ *)args[1]) != 0) {
         TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
     }
 #endif
     else {
         BINARY_LOOP {
-            const SIMD_TYPE in1 = *(SIMD_TYPE *)ip1;
-            const SIMD_TYPE in2 = *(SIMD_TYPE *)ip2;
-            if (NPY_UNLIKELY(in2 == 0 || (in1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && in2 == (SIMD_TYPE)-1))) {
+            const npy_u@type@ in1 = *(npy_u@type@ *)ip1;
+            const npy_u@type@ in2 = *(npy_u@type@ *)ip2;
+            if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_u@type@)NPY_MIN_@TYPE@ && in2 == (npy_u@type@)-1))) {
                 npy_set_floatstatus_divbyzero();
-                *((SIMD_TYPE *)op1) = 0;
+                *((npy_u@type@ *)op1) = 0;
             } else{
-                *((SIMD_TYPE *)op1) = in1 / in2;
-#if @signed@
-                /* Negative quotients needs to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((SIMD_TYPE *)op1) * in2 != in1)) {
-                    *((SIMD_TYPE *)op1) = *((SIMD_TYPE *)op1) - 1;
-                }
-#endif
+                *((npy_u@type@ *)op1) = in1 / in2;
             }
         }
     }
 }
-/**end repeat1**/
 /**end repeat**/
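
Background on the scalar fallback paths in the patch: C integer division truncates toward zero, while NumPy's floor_divide rounds toward negative infinity, so the loops decrement the quotient when the operands have opposite signs and the division is inexact; division by zero and the NPY_MIN / -1 overflow case set the divide-by-zero floating-point status and produce 0. A minimal standalone sketch of the same rule for one fixed width (the helper name and the use of plain int are illustrative, not from the patch):

#include <limits.h>
#include <stdio.h>

/* Floor division for signed 32-bit ints, mirroring the scalar tail loop:
 * truncate, then round down when the signs differ and the result is inexact.
 * The zero-divisor and INT_MIN / -1 cases return 0, as the dispatch loops do
 * (NumPy additionally raises the divide-by-zero status there). */
static int floor_div_i32(int a, int b)
{
    if (b == 0 || (a == INT_MIN && b == -1)) {
        return 0;
    }
    int q = a / b;                 /* C truncates toward zero */
    if (((a > 0) != (b > 0)) && (q * b != a)) {
        q -= 1;                    /* inexact negative quotient: round down */
    }
    return q;
}

int main(void)
{
    printf("%d\n", floor_div_i32(-7, 2));  /* prints -4, not -3 */
    printf("%d\n", floor_div_i32(7, -2));  /* prints -4 */
    printf("%d\n", floor_div_i32(-8, 2));  /* prints -4 (exact, no adjustment) */
    return 0;
}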