Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py           |   8
-rw-r--r--  numpy/core/setup.py                                     |   1
-rw-r--r--  numpy/core/src/umath/loops.c.src                        | 102
-rw-r--r--  numpy/core/src/umath/loops.h.src                        |  27
-rw-r--r--  numpy/core/src/umath/loops_arithmetic.dispatch.c.src    | 206
-rw-r--r--  numpy/core/src/umath/loops_modulo.dispatch.c.src        | 631
6 files changed, 811 insertions, 164 deletions
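
This change moves the integer fmod/remainder/divmod inner loops out of loops.c.src into a new loops_modulo.dispatch.c.src, adds a VSX4 (Power10) vectorized path, and keeps scalar fallbacks that preserve Python's sign convention for signed operands. As a plain-C reference for that convention (a minimal standalone sketch, not code taken from the diff; the zero-divisor and NPY_MIN_INT / -1 guards that the real loops carry are omitted here):

    #include <stdio.h>

    /* Python-style floored divmod for signed integers: the remainder takes
     * the sign of the divisor and the quotient rounds toward -infinity.
     * C's / and % truncate toward zero, so the mixed-sign case is adjusted
     * the same way the scalar loops in this diff do. */
    static void py_divmod_int(int a, int b, int *quo, int *rem)
    {
        int q = a / b;   /* truncates toward zero */
        int r = a % b;   /* same sign as a, or zero */
        if ((a > 0) != (b > 0) && r != 0) {
            q -= 1;      /* step the quotient down ... */
            r += b;      /* ... and fold the remainder onto b's sign */
        }
        *quo = q;
        *rem = r;
    }

    int main(void)
    {
        int q, r;
        py_divmod_int(-7, 3, &q, &r);
        printf("%d %d\n", q, r);  /* prints "-3 2", matching Python's divmod(-7, 3) */
        return 0;
    }
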
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index a33b1f790..e3ec93bf0 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -364,7 +364,7 @@ defdict = { Ufunc(2, 1, None, docstrings.get('numpy.core.umath.fmod'), None, - TD(ints), + TD(ints, dispatch=[('loops_modulo', ints)]), TD(flts, f='fmod', astype={'e': 'f'}), TD(P, f='fmod'), ), @@ -884,7 +884,8 @@ defdict = { Ufunc(2, 1, None, docstrings.get('numpy.core.umath.remainder'), 'PyUFunc_RemainderTypeResolver', - TD(intflt), + TD(ints, dispatch=[('loops_modulo', ints)]), + TD(flts), [TypeDescription('m', FullTypeDescr, 'mm', 'm')], TD(O, f='PyNumber_Remainder'), ), @@ -892,7 +893,8 @@ defdict = { Ufunc(2, 2, None, docstrings.get('numpy.core.umath.divmod'), 'PyUFunc_DivmodTypeResolver', - TD(intflt), + TD(ints, dispatch=[('loops_modulo', ints)]), + TD(flts), [TypeDescription('m', FullTypeDescr, 'mm', 'qm')], # TD(O, f='PyNumber_Divmod'), # gh-9730 ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index f6b31075d..fe52fde0d 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1014,6 +1014,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'), + join('src', 'umath', 'loops_modulo.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), join('src', 'umath', 'clip.h'), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 7f084ac39..3a3e838fc 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -764,23 +764,6 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else { - *((@type@ *)op1)= in1 % in2; - } - - } -} - /**begin repeat1 * #kind = isnan, isinf, isfinite# * #func = npy_isnan, npy_isinf, npy_isfinite# @@ -817,57 +800,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? 
-1 : 0)); } -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else { - /* handle mixed case the way Python does */ - const @type@ rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((@type@ *)op1) = rem; - } - else { - *((@type@ *)op1) = rem + in2; - } - } - } -} - -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP_TWO_OUT { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* see FIXME note for divide above */ - if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - *((@type@ *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const @type@ quo = in1 / in2; - const @type@ rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((@type@ *)op1) = quo; - *((@type@ *)op2) = rem; - } - else { - *((@type@ *)op1) = quo - 1; - *((@type@ *)op2) = rem + in2; - } - } - } -} - /**begin repeat1 * #kind = gcd, lcm# **/ @@ -902,40 +834,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0); } -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else { - *((@type@ *)op1) = in1 % in2; - } - } -} - -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP_TWO_OUT { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - *((@type@ *)op2) = 0; - } - else { - *((@type@ *)op1)= in1/in2; - *((@type@ *)op2) = in1 % in2; - } - } -} - /**begin repeat1 * #kind = gcd, lcm# **/ diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index fce905616..694518ae0 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -64,6 +64,24 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_modulo.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divmod, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_fmod, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_remainder, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /**begin repeat * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ @@ -143,21 +161,12 @@ NPY_NO_EXPORT void @S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_fmod(char **args, npy_intp const 
*dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void @S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void @S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index fc170904f..16a9eac2e 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -101,109 +101,210 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) } /**end repeat**/ +/**begin repeat + * Unsigned types + * #sfx = u8, u16, u32, u64# + * #len = 8, 16, 32, 64# + */ +static NPY_INLINE void +simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); + npyv_store_@sfx@(dst, c); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ a = *src; + *dst = a / scalar; + } + npyv_cleanup(); +} +/**end repeat**/ + #if defined(NPY_HAVE_VSX4) + +/**begin repeat + * #t = u, s# + * #signed = 0, 1# + */ /* + * Computes division of 2 8-bit signed/unsigned integer vectors + * * As Power10 only supports integer vector division for data of 32 bits or * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer * vector division instruction, and then, convert the result back to npyv_u8. 
*/ -NPY_FINLINE npyv_u8 vsx4_div_u8(npyv_u8 a, npyv_u8 b) +NPY_FINLINE npyv_@t@8 +vsx4_div_@t@8(npyv_@t@8 a, npyv_@t@8 b) { +#if @signed@ + npyv_s16x2 ta, tb; + npyv_s32x2 ahi, alo, bhi, blo; + ta.val[0] = vec_unpackh(a); + ta.val[1] = vec_unpackl(a); + tb.val[0] = vec_unpackh(b); + tb.val[1] = vec_unpackl(b); + ahi.val[0] = vec_unpackh(ta.val[0]); + ahi.val[1] = vec_unpackl(ta.val[0]); + alo.val[0] = vec_unpackh(ta.val[1]); + alo.val[1] = vec_unpackl(ta.val[1]); + bhi.val[0] = vec_unpackh(tb.val[0]); + bhi.val[1] = vec_unpackl(tb.val[0]); + blo.val[0] = vec_unpackh(tb.val[1]); + blo.val[1] = vec_unpackl(tb.val[1]); +#else npyv_u16x2 a_expand = npyv_expand_u16_u8(a); + npyv_u16x2 b_expand = npyv_expand_u16_u8(b); npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]); npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]); - npyv_u16x2 b_expand = npyv_expand_u16_u8(b); npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]); npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]); - npyv_u32 divhi1 = vec_div(ahi.val[0], bhi.val[0]); - npyv_u32 divlo1 = vec_div(ahi.val[1], bhi.val[1]); - npyv_u32 divhi2 = vec_div(alo.val[0], blo.val[0]); - npyv_u32 divlo2 = vec_div(alo.val[1], blo.val[1]); - npyv_u16 reshi = (npyv_u16)vec_pack(divhi1, divlo1); - npyv_u16 reslo = (npyv_u16)vec_pack(divhi2, divlo2); - npyv_u8 res = (npyv_u8)vec_pack(reshi, reslo); - return res; +#endif + npyv_@t@32 v1 = vec_div(ahi.val[0], bhi.val[0]); + npyv_@t@32 v2 = vec_div(ahi.val[1], bhi.val[1]); + npyv_@t@32 v3 = vec_div(alo.val[0], blo.val[0]); + npyv_@t@32 v4 = vec_div(alo.val[1], blo.val[1]); + npyv_@t@16 hi = vec_pack(v1, v2); + npyv_@t@16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); } -NPY_FINLINE npyv_u16 vsx4_div_u16(npyv_u16 a, npyv_u16 b) +NPY_FINLINE npyv_@t@16 +vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b) { - npyv_u32x2 a_expand = npyv_expand_u32_u16(a); - npyv_u32x2 b_expand = npyv_expand_u32_u16(b); - npyv_u32 divhi = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_u32 divlo = vec_div(a_expand.val[1], b_expand.val[1]); - npyv_u16 res = (npyv_u16)vec_pack(divhi, divlo); - return res; +#if @signed@ + npyv_s32x2 a_expand; + npyv_s32x2 b_expand; + a_expand.val[0] = vec_unpackh(a); + a_expand.val[1] = vec_unpackl(a); + b_expand.val[0] = vec_unpackh(b); + b_expand.val[1] = vec_unpackl(b); +#else + npyv_u32x2 a_expand = npyv_expand_@t@32_@t@16(a); + npyv_u32x2 b_expand = npyv_expand_@t@32_@t@16(b); +#endif + npyv_@t@32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); + npyv_@t@32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); } -#define vsx4_div_u32 vec_div +#define vsx4_div_@t@32 vec_div +#define vsx4_div_@t@64 vec_div +/**end repeat**/ /**begin repeat * Unsigned types - * #sfx = u8, u16, u32# - * #len = 8, 16, 32# + * #sfx = u8, u16, u32, u64# + * #len = 8, 16, 32, 64# */ static NPY_INLINE void vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) { - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@ zero = npyv_zero_@sfx@(); + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; + const npyv_@sfx@ vzero = npyv_zero_@sfx@(); + const int vstep = npyv_nlanes_@sfx@; for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst += vstep) { + dst1 += vstep) 
{ npyv_@sfx@ a = npyv_load_@sfx@(src1); npyv_@sfx@ b = npyv_load_@sfx@(src2); npyv_@sfx@ c = vsx4_div_@sfx@(a, b); - npyv_store_@sfx@(dst, c); - if (NPY_UNLIKELY(vec_any_eq(b, zero))) { + npyv_store_@sfx@(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { npy_set_floatstatus_divbyzero(); } } - for (; len > 0; --len, ++src1, ++src2, ++dst) { + for (; len > 0; --len, ++src1, ++src2, ++dst1) { const npyv_lanetype_@sfx@ a = *src1; const npyv_lanetype_@sfx@ b = *src2; if (NPY_UNLIKELY(b == 0)) { npy_set_floatstatus_divbyzero(); - *dst = 0; + *dst1 = 0; } else{ - *dst = a / b; + *dst1 = a / b; } } npyv_cleanup(); } /**end repeat**/ -#endif // NPY_HAVE_VSX4 /**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# + * Signed types + * #sfx = s8, s16, s32, s64# + * #len = 8, 16, 32, 64# */ static NPY_INLINE void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) +vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) { - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; + const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); + const npyv_@sfx@ vzero = npyv_zero_@sfx@(); + const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); + npyv_b@len@ warn = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); + const int vstep = npyv_nlanes_@sfx@; - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); - npyv_store_@sfx@(dst, c); + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src1); + npyv_@sfx@ b = npyv_load_@sfx@(src2); + npyv_@sfx@ quo = vsx4_div_@sfx@(a, b); + npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) + npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero); + npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); + npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one); + npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); + npyv_b@len@ error = npyv_or_@sfx@(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_@sfx@ cvtozero = npyv_select_@sfx@(error, vzero, vneg_one); + warn = npyv_or_@sfx@(error, warn); + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); + npyv_b@len@ b_gt_zero = npyv_cmpgt_@sfx@(b, vzero); + npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero); + npyv_b@len@ rem_zero = npyv_cmpeq_@sfx@(rem, vzero); + npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); + npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); + quo = npyv_add_@sfx@(quo, to_sub); + npyv_store_@sfx@(dst1, npyv_and_@sfx@(cvtozero, quo)); } - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - *dst = a / scalar; + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_@sfx@ a = *src1; + const npyv_lanetype_@sfx@ b = *src2; + if (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) { + 
npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } + else { + *dst1 = a / b; + if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) { + *dst1 -= 1; + } + } } npyv_cleanup(); } /**end repeat**/ +#endif // NPY_HAVE_VSX4 #endif // NPY_SIMD /******************************************************************************** @@ -260,6 +361,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) *((@type@ *)iop1) = io1; } #if NPY_SIMD && defined(TO_SIMD_SFX) +#if defined(NPY_HAVE_VSX4) + // both arguments are arrays of the same size + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); + } +#endif // for contiguous block of memory, divisor is a scalar and not 0 else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && (*(@type@ *)args[1]) != 0) { @@ -279,7 +386,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# - * #vector = 1, 1, 1, 0, 0# */ #undef TO_SIMD_SFX #if 0 @@ -316,7 +422,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) *((@type@ *)iop1) = io1; } #if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) && @vector@ +#if defined(NPY_HAVE_VSX4) // both arguments are arrays of the same size else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); diff --git a/numpy/core/src/umath/loops_modulo.dispatch.c.src b/numpy/core/src/umath/loops_modulo.dispatch.c.src new file mode 100644 index 000000000..d0ecc016f --- /dev/null +++ b/numpy/core/src/umath/loops_modulo.dispatch.c.src @@ -0,0 +1,631 @@ +/*@targets + ** baseline vsx4 + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +#if NPY_SIMD && defined(NPY_HAVE_VSX4) +typedef struct { + npyv_u32x2 hi; + npyv_u32x2 lo; +} vsx4_u32x4; + +typedef struct { + npyv_s32x2 hi; + npyv_s32x2 lo; +} vsx4_s32x4; + +// Converts 1 8-bit vector into 2 16-bit vectors +NPY_FINLINE npyv_s16x2 +vsx4_expand_s16_s8(npyv_s8 data) +{ + npyv_s16x2 r; + r.val[0] = vec_unpackh(data); + r.val[1] = vec_unpackl(data); + return r; +} + +// Converts 1 16-bit vector into 2 32-bit vectors +NPY_FINLINE npyv_s32x2 +vsx4_expand_s32_s16(npyv_s16 data) +{ + npyv_s32x2 r; + r.val[0] = vec_unpackh(data); + r.val[1] = vec_unpackl(data); + return r; +} + +/**begin repeat + * #t = u, s# + * #expand = npyv_expand, vsx4_expand# + */ +// Converts 1 8-bit vector into 4 32-bit vectors +NPY_FINLINE vsx4_@t@32x4 +vsx4_expand_@t@32_@t@8(npyv_@t@8 data) +{ + vsx4_@t@32x4 r; + npyv_@t@16x2 expand = @expand@_@t@16_@t@8(data); + r.hi = @expand@_@t@32_@t@16(expand.val[0]); + r.lo = @expand@_@t@32_@t@16(expand.val[1]); + return r; +} + +/**begin repeat1 + * #simd = div, mod## + */ +/* + * Computes division/modulo of 2 8-bit signed/unsigned integer vectors + * + * As Power10 only supports integer vector division/modulo for data of 32 bits + * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer + * vector division/modulo instruction, and then, convert the result back to + * npyv_u8. 
+ */ +NPY_FINLINE npyv_@t@8 +vsx4_@simd@_@t@8(npyv_@t@8 a, npyv_@t@8 b) +{ + vsx4_@t@32x4 a_expand = vsx4_expand_@t@32_@t@8(a); + vsx4_@t@32x4 b_expand = vsx4_expand_@t@32_@t@8(b); + npyv_@t@32 v1 = vec_@simd@(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_@t@32 v2 = vec_@simd@(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_@t@32 v3 = vec_@simd@(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_@t@32 v4 = vec_@simd@(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_@t@16 hi = vec_pack(v1, v2); + npyv_@t@16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_@t@8 +vsx4_@simd@_scalar_@t@8(npyv_@t@8 a, const vsx4_@t@32x4 b_expand) +{ + vsx4_@t@32x4 a_expand = vsx4_expand_@t@32_@t@8(a); + npyv_@t@32 v1 = vec_@simd@(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_@t@32 v2 = vec_@simd@(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_@t@32 v3 = vec_@simd@(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_@t@32 v4 = vec_@simd@(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_@t@16 hi = vec_pack(v1, v2); + npyv_@t@16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_@t@16 +vsx4_@simd@_@t@16(npyv_@t@16 a, npyv_@t@16 b) +{ + npyv_@t@32x2 a_expand = @expand@_@t@32_@t@16(a); + npyv_@t@32x2 b_expand = @expand@_@t@32_@t@16(b); + npyv_@t@32 v1 = vec_@simd@(a_expand.val[0], b_expand.val[0]); + npyv_@t@32 v2 = vec_@simd@(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +NPY_FINLINE npyv_@t@16 +vsx4_@simd@_scalar_@t@16(npyv_@t@16 a, const npyv_@t@32x2 b_expand) +{ + npyv_@t@32x2 a_expand = @expand@_@t@32_@t@16(a); + npyv_@t@32 v1 = vec_@simd@(a_expand.val[0], b_expand.val[0]); + npyv_@t@32 v2 = vec_@simd@(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +#define vsx4_@simd@_@t@32 vec_@simd@ +#define vsx4_@simd@_@t@64 vec_@simd@ +#define vsx4_@simd@_scalar_@t@32 vec_@simd@ +#define vsx4_@simd@_scalar_@t@64 vec_@simd@ +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #sfx = u8, u16, s8, s16# + * #osfx = u32, u32, s32, s32# + * #otype = vsx4_u32x4, npyv_u32x2, vsx4_s32x4, npyv_s32x2# + * #expand = vsx4_expand, npyv_expand, vsx4_expand, vsx4_expand# + */ +// Generates the divisor for the division/modulo operations +NPY_FINLINE @otype@ +vsx4_divisor_@sfx@(const npyv_@sfx@ vscalar) +{ + return @expand@_@osfx@_@sfx@(vscalar); +} +/**end repeat**/ + +/**begin repeat + * #sfx = u32, u64, s32, s64# + */ +NPY_FINLINE npyv_@sfx@ +vsx4_divisor_@sfx@(const npyv_@sfx@ vscalar) +{ + return vscalar; +} +/**end repeat**/ + +/**begin repeat + * Unsigned types + * #sfx = u8, u16, u32, u64# + * #len = 8, 16, 32, 64# + * #divtype = vsx4_u32x4, npyv_u32x2, npyv_u32, npyv_u64# + */ +/**begin repeat1 + * #func = fmod, remainder, divmod# + * #id = 0, 1, 2# + */ +static NPY_INLINE void +vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; + const npyv_@sfx@ vzero = npyv_zero_@sfx@(); + const int vstep = npyv_nlanes_@sfx@; +#if @id@ == 2 /* divmod */ + npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3]; + const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); + npyv_b@len@ warn = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src1); + npyv_@sfx@ b = npyv_load_@sfx@(src2); + npyv_@sfx@ quo = vsx4_div_@sfx@(a, b); + npyv_@sfx@ 
rem = npyv_sub_@sfx@(a, vec_mul(b, quo)); + npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_@sfx@ cvtozero = npyv_select_@sfx@(bzero, vzero, vneg_one); + warn = npyv_or_@sfx@(bzero, warn); + npyv_store_@sfx@(dst1, quo); + npyv_store_@sfx@(dst2, npyv_and_@sfx@(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_@sfx@ a = *src1; + const npyv_lanetype_@sfx@ b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src1); + npyv_@sfx@ b = npyv_load_@sfx@(src2); + npyv_@sfx@ c = vsx4_mod_@sfx@(a, b); + npyv_store_@sfx@(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_@sfx@ a = *src1; + const npyv_lanetype_@sfx@ b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@ vscalar = npyv_setall_@sfx@(scalar); + const @divtype@ divisor = vsx4_divisor_@sfx@(vscalar); +#if @id@ == 2 /* divmod */ + npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src1); + npyv_@sfx@ quo = vsx4_div_scalar_@sfx@(a, divisor); + npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(vscalar, quo)); + npyv_store_@sfx@(dst1, quo); + npyv_store_@sfx@(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_@sfx@ a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src1); + npyv_@sfx@ c = vsx4_mod_scalar_@sfx@(a, divisor); + npyv_store_@sfx@(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_@sfx@ a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * Signed types + * #sfx = s8, s16, s32, s64# + * #len = 8, 16, 32, 64# + * #divtype = vsx4_s32x4, npyv_s32x2, npyv_s32, npyv_s64# + */ +/**begin repeat1 + * #func = fmod, remainder, divmod# + * #id = 0, 1, 2# + */ +static NPY_INLINE void +vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; + const npyv_@sfx@ vzero = npyv_zero_@sfx@(); + const int vstep = npyv_nlanes_@sfx@; +#if @id@ == 2 /* divmod */ + npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3]; + const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); + const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); + npyv_b@len@ warn = 
npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_@sfx@ a = npyv_load_@sfx@(src1); + npyv_@sfx@ b = npyv_load_@sfx@(src2); +#if @id@ <= 1 /* fmod and remainder */ + npyv_@sfx@ rem = vsx4_mod_@sfx@(a, b); +#else /* divmod */ + npyv_@sfx@ quo = vsx4_div_@sfx@(a, b); + npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) + npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero); + npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); + npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one); + npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); + npyv_b@len@ error = npyv_or_@sfx@(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_@sfx@ cvtozero = npyv_select_@sfx@(error, vzero, vneg_one); + warn = npyv_or_@sfx@(error, warn); +#endif +#if @id@ >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); + npyv_b@len@ b_gt_zero = npyv_cmpgt_@sfx@(b, vzero); + npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero); + npyv_b@len@ rem_zero = npyv_cmpeq_@sfx@(rem, vzero); + npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); + npyv_@sfx@ to_add = npyv_select_@sfx@(or, vzero, b); + rem = npyv_add_@sfx@(rem, to_add); +#endif +#if @id@ == 2 /* divmod */ + npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); + quo = npyv_add_@sfx@(quo, to_sub); + npyv_store_@sfx@(dst1, npyv_and_@sfx@(cvtozero, quo)); + npyv_store_@sfx@(dst2, npyv_and_@sfx@(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_@sfx@(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if @id@ == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_@sfx@ a = *src1; + const npyv_lanetype_@sfx@ b = *src2; + if (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_@sfx@ a = *src1; + const npyv_lanetype_@sfx@ b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if @id@ == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; + const npyv_@sfx@ vscalar = npyv_setall_@sfx@(scalar); + const @divtype@ divisor = vsx4_divisor_@sfx@(vscalar); + const int vstep = npyv_nlanes_@sfx@; +#if @id@ >= 1 /* remainder and divmod */ + const npyv_@sfx@ vzero = npyv_zero_@sfx@(); + npyv_b@len@ b_gt_zero = npyv_cmpgt_@sfx@(vscalar, vzero); +#endif +#if @id@ == 2 /* divmod */ + npyv_b@len@ warn = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); + const npyv_@sfx@ 
vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); + const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); + npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(vscalar, vneg_one); + npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_@sfx@ a = npyv_load_@sfx@(src1); +#if @id@ <= 1 /* fmod and remainder */ + npyv_@sfx@ rem = vsx4_mod_scalar_@sfx@(a, divisor); +#else /* divmod */ + npyv_@sfx@ quo = vsx4_div_scalar_@sfx@(a, divisor); + npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT@len@ && b == -1) + npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); + npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_@sfx@ cvtozero = npyv_select_@sfx@(overflow, vzero, vneg_one); + warn = npyv_or_@sfx@(overflow, warn); +#endif +#if @id@ >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); + npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero); + npyv_b@len@ rem_zero = npyv_cmpeq_@sfx@(rem, vzero); + npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); + npyv_@sfx@ to_add = npyv_select_@sfx@(or, vzero, vscalar); + rem = npyv_add_@sfx@(rem, to_add); +#endif +#if @id@ == 2 /* divmod */ + npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); + quo = npyv_add_@sfx@(quo, to_sub); + npyv_store_@sfx@(dst1, npyv_and_@sfx@(cvtozero, quo)); + npyv_store_@sfx@(dst2, npyv_and_@sfx@(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_@sfx@(dst1, rem); +#endif + } + +#if @id@ == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_@sfx@ a = *src1; + if (a == NPY_MIN_INT@len@ && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_@sfx@ a = *src1; + *dst1 = a % scalar; +#if @id@ == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} +/**end repeat1**/ +/**end repeat**/ +#endif // NPY_SIMD && defined(NPY_HAVE_VSX4) + +/***************************************************************************** + ** Defining ufunc inner functions + *****************************************************************************/ + +/**begin repeat + * Signed and Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #if @signed@ + #define TO_SIMD_SFX(X) X##_s@len@ + #else + #define TO_SIMD_SFX(X) X##_u@len@ + #endif +/**end repeat1**/ +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_fmod) +(char **args, npy_intp const *dimensions, 
npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((@type@ *)op1) = 0; + } else{ + *((@type@ *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((@type@ *)op1) = 0; + } else{ +#if @signed@ + /* handle mixed case the way Python does */ + const @type@ rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((@type@ *)op1) = rem; + } + else { + *((@type@ *)op1) = rem + in2; + } +#else + *((@type@ *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if @signed@ + BINARY_LOOP_TWO_OUT { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((@type@ *)op1) = 0; + *((@type@ *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const @type@ quo = in1 / in2; + const @type@ rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((@type@ *)op1) = quo; + *((@type@ *)op2) = rem; + } + else { + *((@type@ *)op1) = quo - 1; + *((@type@ *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((@type@ *)op1) = 0; + *((@type@ *)op2) = 0; + } + else { + *((@type@ *)op1)= in1/in2; + *((@type@ *)op2) = in1 % in2; + } + } +#endif +} +/**end repeat**/ |
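
For readers without the template expansion in front of them, the signed 8-bit division helper from loops_arithmetic.dispatch.c.src above reads, with @t@ hand-substituted to s (an illustrative expansion, not the generated source), roughly as follows. Power10 only provides integer vector division for 32-bit and 64-bit lanes, so 8-bit lanes are widened with vec_unpackh/vec_unpackl, divided at 32 bits, and narrowed back with vec_pack; loops_modulo.dispatch.c.src applies the same unpack/compute/pack pattern to vec_mod through its @simd@ repeat variable.

    NPY_FINLINE npyv_s8
    vsx4_div_s8(npyv_s8 a, npyv_s8 b)
    {
        /* widen each s8 vector to 2x s16, then to 4x s32 */
        npyv_s16x2 ta, tb;
        npyv_s32x2 ahi, alo, bhi, blo;
        ta.val[0]  = vec_unpackh(a);
        ta.val[1]  = vec_unpackl(a);
        tb.val[0]  = vec_unpackh(b);
        tb.val[1]  = vec_unpackl(b);
        ahi.val[0] = vec_unpackh(ta.val[0]);
        ahi.val[1] = vec_unpackl(ta.val[0]);
        alo.val[0] = vec_unpackh(ta.val[1]);
        alo.val[1] = vec_unpackl(ta.val[1]);
        bhi.val[0] = vec_unpackh(tb.val[0]);
        bhi.val[1] = vec_unpackl(tb.val[0]);
        blo.val[0] = vec_unpackh(tb.val[1]);
        blo.val[1] = vec_unpackl(tb.val[1]);
        /* 32-bit integer vector division (ISA 3.1 / Power10) */
        npyv_s32 v1 = vec_div(ahi.val[0], bhi.val[0]);
        npyv_s32 v2 = vec_div(ahi.val[1], bhi.val[1]);
        npyv_s32 v3 = vec_div(alo.val[0], blo.val[0]);
        npyv_s32 v4 = vec_div(alo.val[1], blo.val[1]);
        /* narrow 4x s32 -> 2x s16 -> 1x s8 */
        npyv_s16 hi = vec_pack(v1, v2);
        npyv_s16 lo = vec_pack(v3, v4);
        return vec_pack(hi, lo);
    }
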